# This notebook converts already-downloaded data into parquet

In [6]:
# pyarrow and fastparquet are engines required for pandas to convert to parquet
# 1.16.1 because of https://github.com/scikit-learn-contrib/hdbscan/issues/272. 
# First install numpy and then pandas==0.25.3, because pandas uses numpy to compile and build
! pip install numpy==1.16.1
! pip install pandas==0.25.3
! pip install pyarrow
! pip install fastparquet
! pip install xlrd

Collecting xlrd
  Using cached https://files.pythonhosted.org/packages/b0/16/63576a1a001752e34bf8ea62e367997530dc553b689356b9879339cf45a4/xlrd-1.2.0-py2.py3-none-any.whl
Installing collected packages: xlrd
Successfully installed xlrd-1.2.0


# Define constants

In [2]:
import os

# Dataset folder to process. The same string is used further down as the sheet
# name inside data_types.xlsx and as the prefix of the "<folder>_map.csv" table map.
FOLDER_NAME = "LCAProgramsH1BH1B1E3"

# Year-month sub-folder of the download. To follow the current month instead:
# YEAR_MONTH = datetime.today().strftime("%Y-%m")
YEAR_MONTH = "2019-10"

# Root of the downloaded data: $DOWNLOAD_DIR when set, otherwise the us-visa-data folder
DOWNLOAD_DIR = os.environ.get('DOWNLOAD_DIR', '../us-visa-data')

### Convert all data to parquet for efficient access and analytics
- Define a function that takes a file, a folder name and a year_month
- Use the function return_data_types to enforce data types for all columns, because saving to parquet with mixed data types, non-homogeneous values etc. is a pain to fix at execution time. Just narrow down all data types (big manual work) and life gets easier
- Save the parquet file to a `parquet` sub-folder of the folder in which the data file exists
- As soon as the parquet file is saved, read it back and ensure that there is no data loss, i.e. the original and the parquet file contain the same data

In [3]:
import pandas as pd
import json
def return_data_types(layout_file_name, folder_name, year_month):
    """Look up the ``{column name: dtype}`` mapping for one record layout.

    Reads ``<DOWNLOAD_DIR>/<folder_name>/<year_month>/data_types.xlsx`` (sheet
    ``folder_name``, no header row). The row whose first cell equals
    ``layout_file_name`` holds the column names; the row directly below it holds
    the dtype of each of those columns. The first cell of both rows is a label
    (the layout file name / a placeholder) and is not part of the mapping.

    Parameters
    ----------
    layout_file_name : str
        Record-layout file name, used as the lookup key in the first column.
    folder_name : str
        Dataset folder; also the name of the sheet that is read.
    year_month : str
        Sub-folder of ``folder_name`` that contains ``data_types.xlsx``.

    Returns
    -------
    dict or None
        The column-to-dtype mapping, or ``None`` (after printing the reason)
        when the workbook has no usable entry for ``layout_file_name``.
    """
    # DOWNLOAD_DIR env var if set, otherwise ../us-visa-data.
    # The directory is relative to where this notebook exists.
    download_dir = os.getenv('DOWNLOAD_DIR', "../us-visa-data")
    workbook_path = os.path.join(download_dir, folder_name, year_month, "data_types.xlsx")
    df_data_types = pd.read_excel(workbook_path, sheet_name=folder_name, header=None)

    not_found_msg = "Data types for {0} not found in data_types.xlsx".format(layout_file_name)

    # Locate the row whose first cell is the layout file name. Checking for a
    # match first turns "layout missing" into the message below, not an IndexError.
    if df_data_types.empty:
        print(not_found_msg)
        return None
    is_layout_row = (df_data_types.iloc[:, 0] == layout_file_name).values
    if not is_layout_row.any():
        print(not_found_msg)
        return None
    key_pos = int(is_layout_row.argmax())  # position of the first matching row

    my_keys = df_data_types.iloc[key_pos].tolist()
    print("Keys for {0}: {1}".format(layout_file_name, my_keys))

    # The dtypes are in the row right below the column names; it must exist.
    if key_pos + 1 >= len(df_data_types):
        print(not_found_msg)
        return None
    my_values = df_data_types.iloc[key_pos + 1].tolist()

    # Remove the FIRST element of each row: it is the layout file name for the
    # keys and a placeholder for the dtypes, never a column/dtype.
    del my_keys[0]
    del my_values[0]

    # Both lists are rows of the same sheet, so they normally have equal length;
    # this guard only trips if that ever stops being true.
    if len(my_keys) != len(my_values):
        print("Length of {keys} is NOT equal to length of {values}".format(keys=my_keys, values=my_values))
        return None

    # A sheet is rectangular: rows shorter than the widest one are padded with
    # missing values. A missing cell cannot be a column name, so skip it.
    data_types = {key: value for key, value in zip(my_keys, my_values) if not pd.isna(key)}
    if not data_types:
        print(not_found_msg)
        return None

    print("Data types: {0}".format(data_types))
    return data_types


In [4]:
import pathlib
import os
import pandas as pd
def convert_file_to_parquet_delimited(file, layout_file_name, folder_name, year_month):
    """Convert one downloaded ``.xlsx`` data file to a snappy-compressed parquet file.

    The workbook is read from ``<DOWNLOAD_DIR>/<folder_name>/<year_month>/downloads/``
    with the dtypes returned by ``return_data_types`` and written to the
    ``parquet`` sub-folder of that directory as ``<name>.snappy.parquet``.
    The parquet folder is created if needed. Files that are not ``.xlsx`` and
    files that already have a ``.snappy.parquet`` or ``.gzip.parquet``
    counterpart are skipped. After writing, the parquet file is read back and
    compared with the source dataframe; the outcome is printed, not raised.

    Parameters
    ----------
    file : str
        Name (or ``/``-separated path) of the data file; only the last path
        component is used.
    layout_file_name : str
        Record-layout name passed to ``return_data_types`` to obtain the dtypes.
    folder_name : str
        Dataset folder below the download directory.
    year_month : str
        Sub-folder of ``folder_name`` that contains ``downloads/``.

    Returns
    -------
    None
    """
    actual_filename, actual_file_extension = os.path.splitext(file.split('/')[-1])

    # Assuming all data is inside us-visa-data folder unless DOWNLOAD_DIR is set
    download_dir = os.getenv('DOWNLOAD_DIR', '../us-visa-data')
    downloads_dir = os.path.join(download_dir, folder_name, year_month, "downloads")
    parquet_dir = os.path.join(downloads_dir, "parquet")
    pathlib.Path(parquet_dir).mkdir(parents=True, exist_ok=True)

    # Convert only the actual data files, no need to convert .doc, .pdf
    if actual_file_extension != '.xlsx':
        return

    source_name = actual_filename + actual_file_extension
    parquet_name = actual_filename + ".snappy.parquet"
    parquet_path = os.path.join(parquet_dir, parquet_name)

    # Skip files converted earlier, and report the file that is actually there
    # (the snappy variant is checked first).
    for existing_name in (parquet_name, actual_filename + ".gzip.parquet"):
        if os.path.exists(os.path.join(parquet_dir, existing_name)):
            print("{0} already exists".format(existing_name))
            return

    print("Reading excel file to dataframe....{file}".format(file=source_name))
    df = pd.read_excel(os.path.join(downloads_dir, source_name),
                       dtype=return_data_types(layout_file_name, folder_name, year_month))

    print("Converting {orig_file} to parquet".format(orig_file=source_name))
    # Mixed data types don't save to parquet and error out, e.g. BUSINESS_NAME
    # holding an integer value makes the call below fail.
    # index=False so that row indexes are not saved.
    df.to_parquet(parquet_path, compression='snappy', engine="pyarrow", index=False)

    # Data quality check: the round trip through parquet must not change anything
    df_parquet = pd.read_parquet(parquet_path)
    if df.equals(df_parquet):
        print("{orig_file} has same data and datatypes as that of converted {parquet_file}".format(orig_file=actual_filename+".xlsx", parquet_file=actual_filename+".snappy.parquet"))
    else:
        print("{orig_file} does NOT have same data or datatypes as that of converted {parquet_file}".format(orig_file=actual_filename+".xlsx", parquet_file=actual_filename+".snappy.parquet"))
        # info() prints its report itself and returns None, so it is not wrapped in print()
        df.info()
        df_parquet.info()
    # Both dataframes are locals; they are released when the function returns.

In [7]:
import re
import time
import pandas as pd
# URL pattern: scheme, then a domain name / localhost / dotted-quad address,
# an optional port and an optional path or query.
# re.IGNORECASE is what lets the upper-case-only character classes match
# lower-case host names as well.
# NOTE(review): `regex` is not referenced in the visible part of this notebook;
# it looks like a leftover from the download notebook, confirm before removing.
regex = re.compile(
        r'^(?:http|ftp)s?://' # http://, https://, ftp:// or ftps://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' #domain...
        r'localhost|' #localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
        r'(?::\d+)?' # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)

# Read the already downloaded table map: one row per data file, with the name
# of the record-layout file that describes it.
table_map = pd.read_csv(DOWNLOAD_DIR + "/" + FOLDER_NAME + "/" + YEAR_MONTH + "/" + FOLDER_NAME + "_map.csv")
files = table_map['Actual Disclosure File Name'].tolist()
layout_files = table_map['Actual File Structure'].tolist()
print("List of files to download: {0}".format(files))
print("Corresponding layout files: {0}".format(layout_files))

# Use the below two lines of code for testing a single file
# layout_file_name = table_map[table_map['Actual Disclosure File Name'] == 'PERM_Disclosure_Data_FY2019.xlsx']['Actual File Structure']
# convert_file_to_parquet_delimited("H-1B_Case_Data_FY2009.xlsx", "H-1B Efile Record Layout FY09.rtf", FOLDER_NAME, YEAR_MONTH)

# Each data file is paired with the layout from its own row of the map, which
# avoids re-filtering the whole table for every file.
# The former 5 second sleep per file was a leftover from the download
# notebook; the conversion is purely local, so nothing needs throttling.
for file, layout_file_name in zip(files, layout_files):
    print("Processing file: {0}".format(file))
    print(layout_file_name)
    convert_file_to_parquet_delimited(file, layout_file_name, FOLDER_NAME, YEAR_MONTH)

# clean up and remove from memory
del table_map

List of files to download: ['H-1B_Disclosure_Data_FY2019.xlsx', 'H-1B_Disclosure_Data_FY2018_EOY.xlsx', 'H-1B_Disclosure_Data_FY17.xlsx', 'H-1B_Disclosure_Data_FY16.xlsx', 'H-1B_Disclosure_Data_FY15_Q4.xlsx', 'H-1B_FY14_Q4.xlsx', 'LCA_FY2013.xlsx', 'LCA_FY2012_Q4.xlsx', 'H-1B_iCert_LCA_FY2011_Q4.xlsx', 'H-1B_FY2010.xlsx', 'Icert_ LCA_ FY2009.xlsx', 'H-1B_Case_Data_FY2009.xlsx', 'H-1B_Case_Data_FY2008.xlsx']
Corresponding layout files: ['H-1B_FY19_Record_Layout.pdf', 'H-1B_FY18_Record_Layout.pdf', 'H-1B_FY17_Record_Layout.pdf', 'H-1B_FY16_Record_Layout.pdf', 'H-1B_FY15_Record_Layout.docx', 'H1B_FY14_Record_Layout.doc', 'LCA_Record_Layout_FY13.doc', 'LCA_Record_Layout_FY12.doc', 'H-1B_Record_Layout_FY11_Q4.doc', 'H-1B_Record_Layout_FY10.doc', 'H1B_Layout_FY09.doc', 'H-1B Efile Record Layout FY09.rtf', 'H-1B_Record_Layout_FY08.doc']
Processing file: H-1B_Disclosure_Data_FY2019.xlsx
H-1B_FY19_Record_Layout.pdf
Reading excel file to dataframe....H-1B_Disclosure_Data_FY2019.xlsx
Keys for H-1