# This notebook converts already-downloaded data into Parquet

In [2]:
# pyarrow and fastparquet are engines required for pandas to convert to parquet
# 1.16.1 because of https://github.com/scikit-learn-contrib/hdbscan/issues/272. 
# First install numpy and then pandas==0.25.2, because pandas uses numpy to compile and build
! pip install numpy==1.16.1
! pip install pandas==0.25.2
! pip install pyarrow
! pip install fastparquet

Collecting jupyter_nbextensions_configurator
[?25l  Downloading https://files.pythonhosted.org/packages/51/a3/d72d5f2dc10c5ccf5a6f4c79f636bf071a5ce462dedd07af2f70384db6cb/jupyter_nbextensions_configurator-0.4.1.tar.gz (479kB)
[K     |████████████████████████████████| 481kB 1.6MB/s eta 0:00:01
[?25hCollecting jupyter_contrib_core>=0.3.3
  Downloading https://files.pythonhosted.org/packages/e6/8f/04a752a8b66a66e7092c035e5d87d2502ac7ec07f9fb6059059b6c0dc272/jupyter_contrib_core-0.3.3-py2.py3-none-any.whl
Processing /Users/pmacharl/Library/Caches/pip/wheels/d9/45/dd/65f0b38450c47cf7e5312883deb97d065e030c5cca0a365030/PyYAML-5.1.2-cp36-cp36m-macosx_10_12_x86_64.whl


Building wheels for collected packages: jupyter-nbextensions-configurator
  Building wheel for jupyter-nbextensions-configurator (setup.py) ... [?25ldone
[?25h  Created wheel for jupyter-nbextensions-configurator: filename=jupyter_nbextensions_configurator-0.4.1-py2.py3-none-any.whl size=466145 sha256=89bbfb642cd6d17a5b912c0900ad8725cdca74f7ed3fb3bebcd04b6bde8d85b7
  Stored in directory: /Users/pmacharl/Library/Caches/pip/wheels/15/df/fe/2a74fe34709e7fdc5ae153a768675d9fda93cc7d5133ed1fb0
Successfully built jupyter-nbextensions-configurator
Installing collected packages: jupyter-contrib-core, pyyaml, jupyter-nbextensions-configurator
Successfully installed jupyter-contrib-core-0.3.3 jupyter-nbextensions-configurator-0.4.1 pyyaml-5.1.2


# Define constants

In [117]:
import os
# Root folder of the downloaded data; override it with the DOWNLOAD_DIR environment variable.
# By default all data is assumed to live inside the us-visa-data folder.
DOWNLOAD_DIR = os.environ.get('DOWNLOAD_DIR', '../us-visa-data')
# Sub-folder (below DOWNLOAD_DIR) that holds this program's files.
FOLDER_NAME = "LCAProgramsH1BH1B1E3"
# Month sub-folder of the download to convert. To use the current month instead:
# YEAR_MONTH = datetime.today().strftime("%Y-%m")
YEAR_MONTH = "2019-10"

### Convert all data to parquet for efficient access and analytics
- Define a function that takes the file, folder name and year_month
- Use the function return_data_types to enforce data types for all columns, because saving to parquet with mixed data types, non-homogeneous values, etc. is a pain to fix at execution time. Narrowing down all data types up front is a lot of manual work, but it makes life easier afterwards
- Save the parquet file to the same location in which the data file exists
- As soon as the parquet file is saved, read it back and ensure that there is no data loss, i.e. the original and the parquet file contain the same data

In [119]:
# There are only two record layouts (efile and fax), so their columns are simply listed here.
import pandas as pd

# Column names of the efile layout.
efile_keys = ['Submitted_Date','Case_No','Program (2007 Only)','Name','Address','Address2','City','State','Postal_Code','Nbr_Immigrants','Begin_Date','End_Date','Job_Title','Dol_Decision_Date','Certified_Begin_Date','Certified_End_Date','Job_Code','Approval_Status','Wage_Rate_1','Rate_Per_1','Max_Rate_1','Part_Time_1','City_1','State_1','Prevailing_Wage_1','Wage_Source_1','Yr_Source_Pub_1','Other_Wage_Source_1','Wage_Rate_2','Rate_Per_2','Max_Rate_2','Part_Time_2','City_2','State_2','Prevailing_Wage_2','Wage_Source_2','Yr_Source_Pub_2','Other_Wage_Source_2','Withdrawn']
# Every column is read as a string.
efile_values = ['str' for _ in efile_keys]
# Build the mapping from this layout's own lists (the undefined names `keys`/`values`
# raised NameError on a fresh kernel).
efile_data_types = dict(zip(efile_keys, efile_values))

# Column names of the fax layout.
fax_keys = ['C_num','CertCode','ReturnFax','EmpName','EmpCity','EmpAddy1','EmpAddy2','EmpState','EmpZip','WageRateFrom','WageRateTo','RatePer','PartTime','BeginDate','EndDate','JobCode','NumImmigrants','JobTitle','WorkCity_1','WorkState_1','PrevWage_1','PrevWagePer_1','WageSource_1','WorkYear1','OtherWageSource1','WorkCity2','WorkState2','PrevWage2','PrevWagePer_2','WageSource_2','WorkYear_2','OtherWageSource2','CertStart','CertEnd','Det_Date','ProcessDate']
fax_values = ['str' for _ in fax_keys]
fax_data_types = dict(zip(fax_keys, fax_values))

d = {'efile': efile_data_types, 'fax': fax_data_types}
# One column per layout, indexed by column name. The layouts do not share column names, so
# the index is the union of both key lists and a name missing from a layout would be NaN.
# Fill those gaps with 'str' so that dict(df_data_types[layout]) contains only valid dtypes.
df_data_types = pd.DataFrame(data=d).fillna('str')

In [120]:
import pathlib
import requests
import os
import pandas as pd
def convert_file_to_parquet_delimited(file, layout_file_name, folder_name, year_month, send_type):
    """Convert one downloaded comma-delimited ``.txt`` data file to a snappy parquet file.

    The source file is read from ``<DOWNLOAD_DIR>/<folder_name>/<year_month>/downloads/``
    and the result is written to the ``parquet/`` sub-folder of that directory as
    ``<name>.snappy.parquet``. The parquet file is read back right away and compared
    with the source dataframe; the outcome of the comparison is printed.

    Nothing is converted when the file is not a ``.txt`` file, or when a
    ``.snappy.parquet`` or ``.gzip.parquet`` file for it already exists.

    Parameters
    ----------
    file : str
        Name (or ``/``-separated path) of the data file; only the last path component is used.
    layout_file_name : str
        Name of the record layout file. Not used by this function.
    folder_name : str
        Program folder below DOWNLOAD_DIR.
    year_month : str
        Month sub-folder of the download, e.g. "2019-10".
    send_type : str
        Column of ``df_data_types`` that holds the dtypes for this file's layout.

    Returns
    -------
    None
    """
    file_name = file.split('/')[-1]
    actual_filename, actual_file_extension = os.path.splitext(file_name)

    # Assuming all data is inside us-visa-data folder
    DOWNLOAD_DIR = os.getenv('DOWNLOAD_DIR', '../us-visa-data')
    downloads_dir = os.path.join(DOWNLOAD_DIR, folder_name, year_month, "downloads")
    parquet_dir = os.path.join(downloads_dir, "parquet")

    # The output folder is created for every input, even one that is skipped below.
    pathlib.Path(parquet_dir).mkdir(parents=True, exist_ok=True)

    # Convert only the actual data files; all files prior to 2007 were in .txt or mdb
    if actual_file_extension != '.txt':
        return

    snappy_name = actual_filename + ".snappy.parquet"
    snappy_path = os.path.join(parquet_dir, snappy_name)

    # Skip files that were already converted, and report the file that was actually found.
    for existing_name in (snappy_name, actual_filename + ".gzip.parquet"):
        if os.path.exists(os.path.join(parquet_dir, existing_name)):
            print("{0} already exists".format(existing_name))
            return

    print("Reading text file to dataframe....{file}".format(file=file_name))
    # df_data_types has one column per layout, indexed by column name. Drop entries that are
    # missing (NaN) for this layout, because NaN is not a valid dtype for read_csv.
    layout_dtypes = dict(df_data_types[send_type].dropna())
    df = pd.read_csv(os.path.join(downloads_dir, file_name),
                     dtype=layout_dtypes, encoding="iso-8859-1", low_memory=False, delimiter=",")

    print("Converting {orig_file} to parquet".format(orig_file=file_name))
    # Mixed data types don't save to parquet and error out, e.g. BUSINESS_NAME had an integer
    # value and the call below failed; hence the enforced dtypes above.
    # index=False so that row indexes are not saved
    df.to_parquet(snappy_path, compression='snappy', engine="pyarrow", index=False)

    # Data quality check: the parquet file must hold the same data as the source dataframe
    df_parquet = pd.read_parquet(snappy_path)
    if df.equals(df_parquet):
        print("{orig_file} has same data and datatypes as that of converted {parquet_file}".format(
            orig_file=file_name, parquet_file=snappy_name))
    else:
        print("{orig_file} does NOT have same data or datatypes as that of converted {parquet_file}".format(
            orig_file=file_name, parquet_file=snappy_name))
        # DataFrame.info() prints by itself and returns None, so it is not wrapped in print()
        df.info()
        df_parquet.info()
    # df and df_parquet are locals; their memory is released when the function returns.

In [121]:
import re
import time
import pandas as pd

# Read the already downloaded table map: one row per data file, with the name of its
# record layout file ('Actual File Structure') and its layout type ('Type').
table_map = pd.read_csv(os.path.join(DOWNLOAD_DIR, FOLDER_NAME, YEAR_MONTH, FOLDER_NAME + "_2001to2007_map.csv"))
files = table_map['Actual Disclosure File Name'].tolist()
layout_files = table_map['Actual File Structure'].tolist()
send_types = table_map['Type'].tolist()
print("List of files to download: {0}".format(files))
print("Corresponding layout files: {0}".format(layout_files))

# Use the line below for testing a single file
# convert_file_to_parquet_delimited("EFILE_FY2007.txt", "H-1B_Efile_Record_Layout_FY01-07.xlsx", FOLDER_NAME, YEAR_MONTH, "efile")

# Walk the map row by row, so each file is paired with the layout and type of its own row
# (no repeated filtering of table_map per file).
for file, layout_file_name, send_type in zip(files, layout_files, send_types):
    print("Processing file: {0}".format(file))
    print(layout_file_name)
    convert_file_to_parquet_delimited(file, layout_file_name, FOLDER_NAME, YEAR_MONTH, send_type)

# clean up and remove from memory
del table_map

List of files to download: ['EFILE_FY2007.txt', 'H1B_efile_FY06.txt', 'H1B_efile_FY05.txt', 'H1B_efile_FY04.txt', 'H1B_efile_FY03.txt', 'H1B_efile_FY02.txt', 'H1B_Fax_FY2006_External_Web.txt', 'H1B_Fax_FY2005_Download.txt', 'H1B_fax_FY04.txt', 'H1b_external_fax_FY04_Web.txt', 'H1B_Fax_FY2003_Download.txt', 'H1B_FAX_FY2002_Download.txt', 'H1B_Fax_FY2001_Download.txt']
Corresponding layout files: ['H-1B_Efile_Record_Layout_FY01-07.xlsx', 'H-1B_Efile_Record_Layout_FY01-07.xlsx', 'H-1B_Efile_Record_Layout_FY01-07.xlsx', 'H-1B_Efile_Record_Layout_FY01-07.xlsx', 'H-1B_Efile_Record_Layout_FY01-07.xlsx', 'H-1B_Efile_Record_Layout_FY01-07.xlsx', 'H-1B_Fax_Record_Layout_FY01-07.xlsx', 'H-1B_Fax_Record_Layout_FY01-07.xlsx', 'H-1B_Fax_Record_Layout_FY01-07.xlsx', 'H-1B_Fax_Record_Layout_FY01-07.xlsx', 'H-1B_Fax_Record_Layout_FY01-07.xlsx', 'H-1B_Fax_Record_Layout_FY01-07.xlsx', 'H-1B_Fax_Record_Layout_FY01-07.xlsx']
Processing file: EFILE_FY2007.txt
H-1B_Efile_Record_Layout_FY01-07.xlsx
Reading te