# Exploratory Analysis for I94 immigration data
- Exploratory Analysis with SPARK
- Schema definition
- Reading from SAS and writing to parquet

In [1]:
from datetime import datetime, timedelta

import configparser
import os
from pprint import pprint

import pandas as pd, numpy as np
import matplotlib.pyplot as plt

# Locate the Spark installation before pyspark is imported further down.
import findspark
findspark.init()
print(findspark.find())
# Diagnostic output only: use .get() so an unset variable is reported instead of
# aborting the whole import cell with a KeyError.
print(os.environ.get('SPARK_HOME', '<SPARK_HOME not set>'))
print(os.environ.get('JAVA_HOME', '<JAVA_HOME not set>'))
print(os.environ.get('HADOOP_HOME', '<HADOOP_HOME not set>'))

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.functions import col, asc, desc, min, max
from pyspark.sql.types import *

pd.set_option('display.max_rows', 50)

c:\spark
c:\spark
C:\Program Files\Zulu\zulu-8-jre\
c:\Hadoop


In [2]:
def create_spark_session(local=True):
    """
    Build (or reuse) the SparkSession used by this notebook.

    The session requests the ``saurfang:spark-sas7bdat`` package through
    ``spark.jars.packages`` and has Hive support enabled.

    Args:
        local: Not read by the function body; kept so existing callers
            that pass it keep working.

    Returns:
        The session object returned by ``getOrCreate()``.
    """
    session_builder = (
        SparkSession.builder
        .config("spark.jars.packages", "saurfang:spark-sas7bdat:3.0.0-s_2.12")
        .enableHiveSupport()
    )
    return session_builder.getOrCreate()

In [3]:
# Create (or reuse) the Spark session for this notebook via create_spark_session()
spark = create_spark_session()

In [4]:
# Read the SAS file with the spark-sas7bdat reader without passing a schema,
# so the column types are whatever the reader produces; then print the row count.
df_spark = spark.read.format('com.github.saurfang.sas.spark').load('./data/i94_apr16_sub.sas7bdat')
print(df_spark.count())

3096313


In [5]:
# Show the column names and types of the DataFrame that was read without an explicit schema
df_spark.printSchema()

root
 |-- cicid: double (nullable = true)
 |-- i94yr: double (nullable = true)
 |-- i94mon: double (nullable = true)
 |-- i94cit: double (nullable = true)
 |-- i94res: double (nullable = true)
 |-- i94port: string (nullable = true)
 |-- arrdate: double (nullable = true)
 |-- i94mode: double (nullable = true)
 |-- i94addr: string (nullable = true)
 |-- depdate: double (nullable = true)
 |-- i94bir: double (nullable = true)
 |-- i94visa: double (nullable = true)
 |-- count: double (nullable = true)
 |-- dtadfile: string (nullable = true)
 |-- visapost: string (nullable = true)
 |-- occup: string (nullable = true)
 |-- entdepa: string (nullable = true)
 |-- entdepd: string (nullable = true)
 |-- entdepu: string (nullable = true)
 |-- matflag: string (nullable = true)
 |-- biryear: double (nullable = true)
 |-- dtaddto: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- insnum: string (nullable = true)
 |-- airline: string (nullable = true)
 |-- admnum: double (nullable = 

In [6]:
# define schema
# https://knowledge.udacity.com/questions/316417 - I94cit and I94 res clarification

# Explicit schema for the I94 SAS file. Numeric columns that the schema-less read
# reported as double are declared as IntegerType here; admnum stays DoubleType.
i94_schema = StructType([
    StructField("cicid", IntegerType(), True),    # id
    StructField("i94yr", IntegerType(), True),    # Year
    StructField("i94mon", IntegerType(), True),   # Month
    StructField("i94cit", IntegerType(), True),   # Country code: I94CIT represents the country of citizenship.
    StructField("i94res", IntegerType(), True),   # Country code: I94RES represents the country of residence.
    StructField("i94port", StringType(), True),   # Port code, e.g. 'DTH' = 'DUTCH HARBOR, AK'
    StructField("arrdate", IntegerType(), True),  # ARRDATE is the Arrival Date in the USA. SAS date numeric field
    StructField("i94mode", IntegerType(), True),  # Air, Sea, Land ...
    StructField("i94addr", StringType(), True),   # States: FL, ...
    StructField("depdate", IntegerType(), True),  # Departure date. SAS date numeric field
    StructField("i94bir", IntegerType(), True),   # Age of Respondent in Years
    StructField("i94visa", IntegerType(), True),  # Business, Pleasure, Student
    StructField("count", IntegerType(), True),    # COUNT - Used for summary statistics
    StructField("dtadfile", StringType(), True),  # DTADFILE - Character Date Field - Date added to I-94 Files - CIC does not use
    StructField("visapost", StringType(), True),  # VISAPOST - Department of State where Visa was issued - CIC does not use
    StructField("occup", StringType(), True),     # OCCUP - Occupation that will be performed in U.S. - CIC does not use
    StructField("entdepa", StringType(), True),   # ENTDEPA - Arrival Flag - admitted or paroled into the U.S. - CIC does not use
    StructField("entdepd", StringType(), True),   # ENTDEPD - Departure Flag - Departed, lost I-94 or is deceased - CIC does not use
    StructField("entdepu", StringType(), True),   # ENTDEPU - Update Flag - Either apprehended, overstayed, adjusted to perm residence - CIC does not use
    StructField("matflag", StringType(), True),   # MATFLAG - Match flag - Match of arrival and departure records
    StructField("biryear", IntegerType(), True),  # BIRYEAR - 4 digit year of birth
    StructField("dtaddto", StringType(), True),   # DTADDTO - Character Date Field - Date to which admitted to U.S. (allowed to stay until) - CIC does not use
    StructField("gender", StringType(), True),    # GENDER - Non-immigrant sex
    StructField("insnum", StringType(), True),    # INSNUM - INS number
    StructField("airline", StringType(), True),   # AIRLINE - Airline used to arrive in U.S.
    StructField("admnum", DoubleType(), True),    # ADMNUM - Admission Number
    StructField("fltno", StringType(), True),     # FLTNO - Flight number of Airline used to arrive in U.S.
    StructField("visatype", StringType(), True),  # VISATYPE - Class of admission legally admitting the non-immigrant to temporarily stay in U.S.
])        

In [7]:
# additional column clarifications
# https://knowledge.udacity.com/questions/297018

#cicid (int) : This is the unique Identifier --> Primary key
#i94yr (int) : 4 digit year of the arrival
#i94mon(int) : numeric month of the arrival
#i94cit (int) : 3 digit country code of citizenship (a country, not a city)
#i94port(string) : 3 character code of destination city --> Foreign key (used to map to USDemographics and City Temperature data)
#arrdate (float) : arrival date in the USA (SAS numeric date)
#i94mode (float): 1 digit travel code
#depdate (float) = departure date
#i94visa (string): reason for immigration
#visatype (string): Visa type(Student, Job, etc.)
#Arrival_date (Datetime): arrdate in datetime object.Used to map to Date dimensions table ---> Foreign key

In [8]:
# Re-read the same SAS file, this time passing i94_schema so the columns get the
# declared types; print the row count again.
df_spark = spark.read.format('com.github.saurfang.sas.spark').load('./data/i94_apr16_sub.sas7bdat', schema=i94_schema)
print(df_spark.count())

3096313


In [9]:
#https://knowledge.udacity.com/questions/66798

# convert SAS date to date
def convert_sas_date(x):
    """
    Convert a SAS numeric date (day offset from 1960-01-01) to a ``datetime``.

    Args:
        x: Number of days since 1960-01-01; any value accepted by ``int()``.

    Returns:
        ``datetime`` at midnight of the resulting day, or ``None`` when ``x``
        is missing (None/NaN), not convertible to an integer, or out of the
        range ``datetime`` can represent.
    """
    try:
        return datetime(1960, 1, 1) + timedelta(days=int(x))
    # Only conversion failures are mapped to None; a bare ``except`` would also
    # swallow KeyboardInterrupt/SystemExit.
    except (TypeError, ValueError, OverflowError):
        return None
    
# Wrap convert_sas_date as a Spark UDF whose declared return type is DateType
udf_date_from_sas = udf(lambda x: convert_sas_date(x), DateType())

In [10]:
# convert string format to date

def convert_str_to_date(x):
    """
    Parse a ``YYYYMMDD`` string into a ``datetime``.

    Args:
        x: Date string in ``%Y%m%d`` format, e.g. ``"20160401"``.

    Returns:
        ``datetime`` at midnight of that day, or ``None`` when ``x`` is not a
        string (e.g. None) or does not match the format.
    """
    try:
        return datetime.strptime(x, "%Y%m%d")
    # strptime raises TypeError for non-strings and ValueError for malformed
    # strings; anything else (e.g. KeyboardInterrupt) is allowed to propagate.
    except (TypeError, ValueError):
        return None

# Wrap convert_str_to_date as a Spark UDF whose declared return type is DateType
udf_date_from_str = udf(lambda x: convert_str_to_date(x), DateType())

In [11]:
# Sanity check of the "%Y%m%d" format string on a sample value
datetime.strptime("20130811", "%Y%m%d")

datetime.datetime(2013, 8, 11, 0, 0)

In [12]:
# Derive date-typed columns: arrival/departure from the SAS numeric dates,
# dtadfile_date from the dtadfile string.
df_spark = (
    df_spark
    .withColumn("arrival_date", udf_date_from_sas("arrdate"))
    .withColumn("departure_date", udf_date_from_sas("depdate"))
    .withColumn("dtadfile_date", udf_date_from_str("dtadfile"))
)

In [13]:
# Preview the first 5 rows, including the derived date columns
df_spark.show(5)

+-----+-----+------+------+------+-------+-------+-------+-------+-------+------+-------+-----+--------+--------+-----+-------+-------+-------+-------+-------+--------+------+------+-------+--------------+-----+--------+------------+--------------+-------------+
|cicid|i94yr|i94mon|i94cit|i94res|i94port|arrdate|i94mode|i94addr|depdate|i94bir|i94visa|count|dtadfile|visapost|occup|entdepa|entdepd|entdepu|matflag|biryear| dtaddto|gender|insnum|airline|        admnum|fltno|visatype|arrival_date|departure_date|dtadfile_date|
+-----+-----+------+------+------+-------+-------+-------+-------+-------+------+-------+-----+--------+--------+-----+-------+-------+-------+-------+-------+--------+------+------+-------+--------------+-----+--------+------------+--------------+-------------+
|    6| 2016|     4|   692|   692|    XXX|  20573|   null|   null|   null|    37|      2|    1|    null|    null| null|      T|   null|      U|   null|   1979|10282016|  null|  null|   null| 1.897628485E9| null|

In [14]:
# Check whether cicid is a unique id: the distinct cicid count (first number)
# should equal the total row count (second number)
print(df_spark.select("cicid").dropDuplicates().count())
print(df_spark.count())

3096313
3096313


In [15]:
# check uniqueness: list the distinct i94yr values
df_spark.select("i94yr").dropDuplicates().show()

+-----+
|i94yr|
+-----+
| 2016|
+-----+



In [16]:
# check uniqueness: list the distinct i94mon values
df_spark.select("i94mon").dropDuplicates().show()

+------+
|i94mon|
+------+
|     4|
+------+



In [17]:
# check frequency: rows per i94cit (country of citizenship code), most common first
df_spark.groupby("i94cit").count().sort(desc("count")).show()

+------+------+
|i94cit| count|
+------+------+
|   135|360157|
|   209|206873|
|   245|191425|
|   111|188766|
|   582|175781|
|   148|157806|
|   254|137735|
|   689|129833|
|   213|110691|
|   438|109884|
|   117| 78535|
|   123| 76920|
|   687| 69853|
|   129| 57224|
|   691| 54120|
|   130| 45269|
|   251| 41744|
|   692| 41349|
|   252| 41132|
|   696| 40785|
+------+------+
only showing top 20 rows



In [18]:
# check frequency: rows per i94port code, most common first
df_spark.groupby("i94port").count().sort(desc("count")).show()

+-------+------+
|i94port| count|
+-------+------+
|    NYC|485916|
|    MIA|343941|
|    LOS|310163|
|    SFR|152586|
|    ORL|149195|
|    HHW|142720|
|    NEW|136122|
|    CHI|130564|
|    HOU|101481|
|    FTL| 95977|
|    ATL| 92579|
|    LVG| 89280|
|    AGA| 80919|
|    WAS| 74835|
|    DAL| 71809|
|    BOS| 57354|
|    SEA| 47719|
|    PHO| 38890|
|    DET| 37832|
|    TAM| 25632|
+-------+------+
only showing top 20 rows



In [20]:
# check min and max of the derived arrival_date column
df_spark.selectExpr("min(arrival_date)", "max(arrival_date)").show()

+-----------------+-----------------+
|min(arrival_date)|max(arrival_date)|
+-----------------+-----------------+
|       2016-04-01|       2016-04-30|
+-----------------+-----------------+



In [21]:
# check frequency: rows per i94mode value (including null), most common first
df_spark.groupby("i94mode").count().sort(desc("count")).show()

+-------+-------+
|i94mode|  count|
+-------+-------+
|      1|2994505|
|      3|  66660|
|      2|  26349|
|      9|   8560|
|   null|    239|
+-------+-------+



- add text mapping directly

In [22]:
# check frequency: rows per i94addr (state code, including null), most common first
df_spark.groupby("i94addr").count().sort(desc("count")).show()

+-------+------+
|i94addr| count|
+-------+------+
|     FL|621701|
|     NY|553677|
|     CA|470386|
|     HI|168764|
|   null|152592|
|     TX|134321|
|     NV|114609|
|     GU| 94107|
|     IL| 82126|
|     NJ| 76531|
|     MA| 70486|
|     WA| 55792|
|     GA| 44663|
|     MI| 32101|
|     VA| 31399|
|     PA| 30293|
|     DC| 28228|
|     NE| 26574|
|     MD| 25360|
|     NC| 23375|
+-------+------+
only showing top 20 rows



In [23]:
# check outliers: summary statistics (count, mean, stddev, min, max) of i94bir (age in years)
df_spark.describe("i94bir").show()

+-------+------------------+
|summary|            i94bir|
+-------+------------------+
|  count|           3095511|
|   mean|41.767614458485205|
| stddev|17.420260534588213|
|    min|                -3|
|    max|               114|
+-------+------------------+



- i94bir (age in years) contains negative values (min is -3)

In [24]:
# check frequency: rows per i94visa code, most common first
df_spark.groupby("i94visa").count().sort(desc("count")).show()

+-------+-------+
|i94visa|  count|
+-------+-------+
|      2|2530868|
|      1| 522079|
|      3|  43366|
+-------+-------+



In [25]:
# Rows per i94addr, most common first (same query as the earlier i94addr frequency check)
df_spark.groupby("i94addr").count().sort(desc("count")).show()

+-------+------+
|i94addr| count|
+-------+------+
|     FL|621701|
|     NY|553677|
|     CA|470386|
|     HI|168764|
|   null|152592|
|     TX|134321|
|     NV|114609|
|     GU| 94107|
|     IL| 82126|
|     NJ| 76531|
|     MA| 70486|
|     WA| 55792|
|     GA| 44663|
|     MI| 32101|
|     VA| 31399|
|     PA| 30293|
|     DC| 28228|
|     NE| 26574|
|     MD| 25360|
|     NC| 23375|
+-------+------+
only showing top 20 rows



In [26]:
# Show 10 non-null occup values
df_spark.select("occup").where(col("occup").isNotNull()).show(10)

+-----+
|occup|
+-----+
|  ELT|
|  PHS|
|  EXA|
|  EXA|
|  EXA|
|  STU|
|  STU|
|  STU|
|  MKT|
|  STU|
+-----+
only showing top 10 rows



In [27]:
# Number of distinct non-null occup values
df_spark.select("occup").where(col("occup").isNotNull()).dropDuplicates().count()

111

- no detailed description available

In [28]:
# Rows per gender value (including null), most common first
df_spark.groupby("gender").count().sort(desc("count")).show()

+------+-------+
|gender|  count|
+------+-------+
|     M|1377224|
|     F|1302743|
|  null| 414269|
|     X|   1610|
|     U|    467|
+------+-------+



- X could be intersex or other
- U could be Unknown
- Since the meaning of X is not clear, X will be combined with U as Unknown; null will also be labeled as Unknown

**Resources**
- https://www.federalregister.gov/documents/2013/03/27/2013-06974/definition-of-form-i-94-to-include-electronic-format
- https://www.themandarin.com.au/83991-what-is-gender-x/




In [29]:
# Rows per airline code (including null), most common first
df_spark.groupby("airline").count().sort(desc("count")).show()

+-------+------+
|airline| count|
+-------+------+
|     AA|310091|
|     UA|264271|
|     DL|252526|
|     BA|190997|
|     LH|120556|
|     VS|113384|
|   null| 83627|
|     AF| 81113|
|     KE| 71047|
|     JL| 69075|
|     AM| 60307|
|     EK| 55800|
|     CM| 49990|
|     B6| 49265|
|     AV| 48921|
|     JJ| 46277|
|     LA| 43111|
|     QF| 41945|
|     NH| 40665|
|     KL| 39978|
+-------+------+
only showing top 20 rows



In [30]:
# Rows per visatype, most common first
df_spark.groupby("visatype").count().sort(desc("count")).show()

+--------+-------+
|visatype|  count|
+--------+-------+
|      WT|1309059|
|      B2|1117897|
|      WB| 282983|
|      B1| 212410|
|     GMT|  89133|
|      F1|  39016|
|      E2|  19383|
|      CP|  14758|
|      E1|   3743|
|       I|   3176|
|      F2|   2984|
|      M1|   1317|
|      I1|    234|
|     GMB|    150|
|      M2|     49|
|     SBP|     11|
|     CPL|     10|
+--------+-------+



- Enrichment possible: map each visatype code to a human-readable description
- https://travel.state.gov/content/travel/en/us-visas/visa-information-resources/all-visa-categories.html

In [31]:
# check for duplicates: distinct admnum count (first number) vs. total row count (second number)
print(df_spark.select("admnum").dropDuplicates().count())
print(df_spark.select("admnum").count())

3075579
3096313


In [32]:
# The ten most frequent admnum values with their row counts
df_spark.select("admnum").groupBy("admnum").count().sort(desc("count")).limit(10).show()

+---------------+-----+
|         admnum|count|
+---------------+-----+
|            0.0|   68|
| 7.812054623E10|   11|
| 8.924999063E10|    9|
| 4.652077483E10|    9|
| 4.701040863E10|    8|
| 8.904084763E10|    8|
| 8.989482993E10|    7|
|5.6036428333E10|    7|
| 8.581902733E10|    7|
| 3.697806763E10|    7|
+---------------+-----+



In [34]:
# Inspect all rows for one repeated admnum value, ordered by arrival date.
# NOTE: admnum is DoubleType in i94_schema, so this is a floating-point equality comparison.
df_spark.where(col("admnum") == 7.812054623E10).sort(asc("arrival_date")).show()

+-------+-----+------+------+------+-------+-------+-------+-------+-------+------+-------+-----+--------+--------+-----+-------+-------+-------+-------+-------+--------+------+------+-------+--------------+-----+--------+------------+--------------+-------------+
|  cicid|i94yr|i94mon|i94cit|i94res|i94port|arrdate|i94mode|i94addr|depdate|i94bir|i94visa|count|dtadfile|visapost|occup|entdepa|entdepd|entdepu|matflag|biryear| dtaddto|gender|insnum|airline|        admnum|fltno|visatype|arrival_date|departure_date|dtadfile_date|
+-------+-----+------+------+------+-------+-------+-------+-------+-------+------+-------+-----+--------+--------+-----+-------+-------+-------+-------+-------+--------+------+------+-------+--------------+-----+--------+------------+--------------+-------------+
|  60160| 2016|     4|   209|   209|    ANZ|  20545|      3|   null|  20548|    30|      1|    1|20160401|     KBO| null|      Z|      I|   null|      M|   1986|10302017|     M|  null|   null|7.812054623E1

- could be the same person

In [5]:
# check entire dataset for duplicates (exclude primary key):
# cols is every column except the first; if dropping duplicates over those columns
# reduces the row count, at least two rows are identical apart from the first column.
# cols is reused by the null check further down.
cols = df_spark.columns[1:]
if df_spark.count() > df_spark.dropDuplicates(cols).count():
    raise ValueError('Data has duplicates')

- no duplicates found!

In [6]:
def check_nulls(df, column):
    """
    Print the number of rows in ``df`` whose ``column`` value is null.

    Args:
        df: DataFrame to inspect.
        column: Name of the column to check.

    Returns:
        None. The result is printed as ``"<null_count> for <column>"``.
    """
    is_missing = col(column).isNull()
    missing_rows = df.select(column).where(is_missing)
    null_count = missing_rows.count()
    print(f"{null_count} for {column}")

In [7]:
# Print the null count of every column in cols (all columns except the first);
# check_nulls runs one count per column.
for x in cols:
    check_nulls(df_spark, x)

0 for i94yr
0 for i94mon
0 for i94cit
0 for i94res
0 for i94port
0 for arrdate
239 for i94mode
152592 for i94addr
142457 for depdate
802 for i94bir
0 for i94visa
0 for count
1 for dtadfile
1881250 for visapost
3088187 for occup
238 for entdepa
138429 for entdepd
3095921 for entdepu
138429 for matflag
802 for biryear
477 for dtaddto
414269 for gender
2982605 for insnum
83627 for airline
0 for admnum
19549 for fltno
0 for visatype


In [8]:
# write as parquet
#output_data = "../data/"
#df_spark.write.parquet(output_data+"i94.parquet", mode="overwrite", partitionBy=['i94yr', 'i94mon'] )