# ETL Development - Solve Schema Error on June file

- Serves as the base template for `etl.py`
- Documents the cleaning steps
- Tests each cleaning step on the I94 dataset

In [1]:
from datetime import datetime, timedelta

import configparser
import os
from pprint import pprint

import pandas as pd, numpy as np
import matplotlib.pyplot as plt

import findspark
# Locate the Spark installation so that the pyspark imports further down in
# this cell can be resolved, and print where it was found.
findspark.init()
print(findspark.find())

# Echo the environment the Spark session will run with. A variable that is not
# set is reported as such instead of raising KeyError and aborting this cell
# before the remaining imports have run.
for env_var in ('SPARK_HOME', 'JAVA_HOME', 'HADOOP_HOME'):
    print(os.environ.get(env_var, f'<{env_var} is not set>'))

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.functions import col, asc, desc, min, max, coalesce, lit
from pyspark.sql.types import *

pd.set_option('display.max_rows', 50)

c:\spark
c:\spark
C:\Program Files\Zulu\zulu-8-jre\
c:\Hadoop


## Configs

In [4]:
def create_spark_session(local=True):
    """
    Build (or reuse) the SparkSession used throughout this notebook.

    The session is configured with the spark-sas7bdat package, which provides
    the 'com.github.saurfang.sas.spark' reader used for the I94 file, and has
    Hive support enabled.

    Args:
        local: Accepted for callers that pass it; the function body does not
            read it.

    Returns:
        The SparkSession returned by getOrCreate().
    """
    builder = SparkSession.builder
    builder = builder.config("spark.jars.packages", "saurfang:spark-sas7bdat:3.0.0-s_2.12")
    builder = builder.enableHiveSupport()
    session = builder.getOrCreate()

    # Intentionally left disabled:
    # session.conf.set("mapreduce.fileoutputcommitter.algorithm.version", "2")
    return session


# Create the session once; the cells below use it as `spark`.
spark = create_spark_session()

## Read I94

In [11]:
# Explicit schema for the I94 records.
# https://knowledge.udacity.com/questions/316417 - I94cit and I94 res clarification
#
# Each entry below is (column name, Spark type). Every column is declared
# nullable, and the order of the entries is the order of the schema fields.
i94_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("cicid", IntegerType()),     # id
        ("i94yr", IntegerType()),     # Year
        ("i94mon", IntegerType()),    # Month
        ("i94cit", IntegerType()),    # Country code: I94CIT represents the country of citizenship.
        ("i94res", IntegerType()),    # Country code: I94RES represents the country of residence.
        ("i94port", StringType()),    # e.g. 'DTH' = 'DUTCH HARBOR, AK'
        ("arrdate", IntegerType()),   # ARRDATE is the Arrival Date in the USA. SAS date numeric field
        ("i94mode", IntegerType()),   # Air, Sea, Land ...
        ("i94addr", StringType()),    # States: FL, ...
        ("depdate", IntegerType()),   # SAS date numeric field
        ("i94bir", IntegerType()),    # Age of Respondent in Years
        ("i94visa", IntegerType()),   # Business, Pleasure, Student
        ("count", IntegerType()),     # COUNT - Used for summary statistics
        ("dtadfile", StringType()),   # DTADFILE - Character Date Field - Date added to I-94 Files - CIC does not use
        ("visapost", StringType()),   # VISAPOST - Department of State where Visa was issued - CIC does not use
        ("occup", StringType()),      # OCCUP - Occupation that will be performed in U.S. - CIC does not use
        ("entdepa", StringType()),    # ENTDEPA - Arrival Flag - admitted or paroled into the U.S. - CIC does not use
        ("entdepd", StringType()),    # ENTDEPD - Departure Flag - Departed, lost I-94 or is deceased - CIC does not use
        ("entdepu", StringType()),    # ENTDEPU - Update Flag - Either apprehended, overstayed, adjusted to perm residence - CIC does not use
        ("matflag", StringType()),    # MATFLAG - Match flag - Match of arrival and departure records
        ("biryear", IntegerType()),   # BIRYEAR - 4 digit year of birth
        ("dtaddto", StringType()),    # DTADDTO - Character Date Field - Date to which admitted to U.S. (allowed to stay until) - CIC does not use
        ("gender", StringType()),     # GENDER - Non-immigrant sex
        ("insnum", StringType()),     # INSNUM - INS number
        ("airline", StringType()),    # AIRLINE - Airline used to arrive in U.S.
        ("admnum", DoubleType()),     # ADMNUM - Admission Number
        ("fltno", StringType()),      # FLTNO - Flight number of Airline used to arrive in U.S.
        ("visatype", StringType()),   # VISATYPE - Class of admission legally admitting the non-immigrant to temporarily stay in U.S.
    )
])

In [35]:
# Read the I94 SAS file and project it onto i94_schema.
#
# The file is loaded without a user-supplied schema, and every column listed
# in i94_schema is then selected by name and cast to its declared type.
# Because the selection is by name, the result does not depend on the order of
# the columns in the file, and columns that i94_schema does not list are left
# out of df_spark.
input_data = '../../staging/i94/i94_jun16_sub.sas7bdat'

try:
    df_spark_no_schema = spark.read.format('com.github.saurfang.sas.spark').load(input_data)
except Exception:
    # Report the failure and stop here with the real error. Swallowing it
    # would only defer the failure to the first use of df_spark, which would
    # then surface as an unrelated NameError.
    print("Error Reading the file.")
    raise

# Keep the raw read available to Spark SQL under the view name "i94_no_schema".
df_spark_no_schema.createOrReplaceTempView("i94_no_schema")

# Derive the projection from i94_schema so the schema definition is the single
# place where the column list and the column types are declared.
df_spark = df_spark_no_schema.select([
    col(field.name).cast(field.dataType).alias(field.name)
    for field in i94_schema.fields
])

print(df_spark.count())

Alternative Read
3574989


In [36]:
# Print the column names, types and nullability of df_spark.
df_spark.printSchema()


root
 |-- cicid: integer (nullable = true)
 |-- i94yr: integer (nullable = true)
 |-- i94mon: integer (nullable = true)
 |-- i94cit: integer (nullable = true)
 |-- i94res: integer (nullable = true)
 |-- i94port: string (nullable = true)
 |-- arrdate: integer (nullable = true)
 |-- i94mode: integer (nullable = true)
 |-- i94addr: string (nullable = true)
 |-- depdate: integer (nullable = true)
 |-- i94bir: integer (nullable = true)
 |-- i94visa: integer (nullable = true)
 |-- count: integer (nullable = true)
 |-- dtadfile: string (nullable = true)
 |-- visapost: string (nullable = true)
 |-- occup: string (nullable = true)
 |-- entdepa: string (nullable = true)
 |-- entdepd: string (nullable = true)
 |-- entdepu: string (nullable = true)
 |-- matflag: string (nullable = true)
 |-- biryear: integer (nullable = true)
 |-- dtaddto: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- insnum: string (nullable = true)
 |-- airline: string (nullable = true)
 |-- admnum: double 

In [37]:
# Show the first 12 rows of df_spark as a text table.
df_spark.show(12)

+-----+-----+------+------+------+-------+-------+-------+-------+-------+------+-------+-----+--------+--------+-----+-------+-------+-------+-------+-------+--------+------+------+-------+---------------+-----+--------+
|cicid|i94yr|i94mon|i94cit|i94res|i94port|arrdate|i94mode|i94addr|depdate|i94bir|i94visa|count|dtadfile|visapost|occup|entdepa|entdepd|entdepu|matflag|biryear| dtaddto|gender|insnum|airline|         admnum|fltno|visatype|
+-----+-----+------+------+------+-------+-------+-------+-------+-------+------+-------+-----+--------+--------+-----+-------+-------+-------+-------+-------+--------+------+------+-------+---------------+-----+--------+
|    4| 2016|     6|   135|   135|    XXX|  20612|   null|   null|   null|    59|      2|    1|    null|    null| null|      Z|   null|      U|   null|   1957|10032016|  null|  null|   null|1.4938462027E10| null|      WT|
|    5| 2016|     6|   135|   135|    XXX|  20612|   null|   null|   null|    50|      2|    1|    null|    null