# Project 08 - Analysis of U.S. Immigration (I-94) Data
### Udacity Data Engineer - Capstone Project
> by Peter Wissel | 2021-05-05

## Project Overview
This project works with a data set on immigration to the United States. Supplementary data sets cover airport codes,
U.S. city demographics and temperature data.

The following process is divided into different sub-steps to illustrate how to answer the questions set by the business
analytics team.

The project is organized into the following steps:
* Step 4: Run ETL to Model the Data



#### 4.3 Data dictionary
Create a data dictionary for your data model. For each field, provide a brief description of what the data is and where
it came from. You can include the data dictionary in the notebook or in a separate file.

Generate a dictionary from the table columns, then manually fill in the meaning of each column, as in the following example:



In [1]:
###### Imports and Installs section
import shutil
from pathlib import Path
import pandas as pd
import pyspark.sql.functions as F
# import spark as spark
from pyspark.sql.types import StructType, StructField, DoubleType, StringType, IntegerType, LongType, TimestampType, DateType
from datetime import datetime, timedelta
from pyspark.sql import SparkSession, DataFrameNaFunctions
from pyspark.sql.functions import when, count, col, to_date, datediff, date_format, month
import re
import json
from os import path


# Memory limit applied to both the Spark executor and the Spark driver below.
MAX_MEMORY = "5g"

# Build (or reuse) the Spark session used by the cells below.
# The application name is set exactly once: a second .appName() call would
# silently override the descriptive name given here.
spark = (
    SparkSession.builder
    .appName("etl pipeline for project 8 - I94 data")
    .config("spark.jars.packages", "saurfang:spark-sas7bdat:3.0.0-s_2.12")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.executor.memory", MAX_MEMORY)
    .config("spark.driver.memory", MAX_MEMORY)
    .enableHiveSupport()
    .getOrCreate()
)

# setting the current LOG-Level
spark.sparkContext.setLogLevel("ERROR")

In [2]:
# create data dictionary of used star schema
def _table_name_from_location(location):
    """Return the last path segment of *location*; it is used as the table name.

    Raises:
        ValueError: If no segment is left after stripping trailing slashes.
    """
    table_name = str(location).rstrip("/").rpartition("/")[2]
    if not table_name:
        raise ValueError(f"Cannot derive a table name from location {location!r}")
    return table_name


def create_data_dictionary_from_df(location_to_read):
    """Build a data dictionary skeleton from the columns of parquet data sets.

    Every table and every column gets the placeholder description "not set",
    which is meant to be replaced afterwards.

    Args:
        location_to_read: Iterable of parquet locations, one per table. A single
            string is accepted as well and treated as a one-element list. The
            last path segment of each location becomes the table name.

    Returns:
        A dict of the form
        ``{"tables": {<table>: {"table_name": ..., "table_description": ...,
        "columns": {<column>: {"column_name": ..., "column_description": ...}}}}}``.
        With no locations the result is ``{"tables": {}}``.

    Raises:
        ValueError: If a table name cannot be derived from a location.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(location_to_read, str):
        location_to_read = [location_to_read]

    tables = {}

    # loop through the locations to read (star schema)
    for location in location_to_read:
        # Derive the name first so that a malformed location fails before any data is read.
        table_name = _table_name_from_location(location)

        # Only the column names are needed; they come from the module-level Spark session.
        column_names = spark.read.parquet(location).columns

        tables[table_name] = {
            "table_name": table_name,
            "table_description": "not set",
            "columns": {
                column_name: {"column_name": column_name,
                              "column_description": "not set"}
                for column_name in column_names
            },
        }

    return {"tables": tables}

In [3]:
# add table and column descriptions manually
# The following part is specific to this Project. Every description has to be configured separately.
# Layout: table name -> (table description, {column name -> column description})
_TABLE_DESCRIPTIONS = {
    "d_immigration_countries": (
        "Country where immigrants come from to the U.S.",
        {
            "d_ic_id": "PK of table d_immigration_countries",
            "d_ic_country_code": "Abbreviation of country code",
            "d_ic_country_name": "Name of country",
        },
    ),
    "d_immigration_airports": (
        "Airport name where foreign people arrive to the U.S. ",
        {
            "d_ia_id": "PK of table d_immigration_airports",
            "d_ia_airport_code": "Abbreviation code of Airport",
            "d_ia_airport_name": "Name of Airport",
            "d_ia_airport_state_code": "Abbreviation of state where Airport is located",
        },
    ),
    "d_date_arrivals": (
        "Arrival date for foreign persons to immigrate to the U.S.? ",
        {
            "d_da_id": "PK of table d_date_arrivals",
            "d_da_date": "Date when foreign persons arrive for immigration to the U.S. ",
            "d_da_year": "Year of arrival like '2020'",
            "d_da_year_quarter": "Year and quarter of arrival like '2016/1'",
            "d_da_year_month": "Year and month of arrival like '2016/01'",
            "d_da_quarter": "Quarter of arrival like '1'",
            "d_da_month": "Month of arrival like '1'",
            "d_da_week": "Week of arrival like '53'",
            "d_da_weekday": "Day of week like 'Friday'",
            "d_da_weekday_short": "Day of week in short form like 'Fri'",
            "d_da_dayofweek": "Day of week as number like '6'",
            "d_da_day": "Day number of current date like 2016-01-01 --> 1",
        },
    ),
    "d_date_departures": (
        "Departure date from USA",
        {
            "d_dd_id": "PK of table d_date_departures",
            "d_dd_date": "Date when foreign persons departure for immigration to the U.S. ",
            "d_dd_year": "Year of departure like '2020'",
            "d_dd_year_quarter": "Year and quarter of departure like '2016/1'",
            # Previously a copy of the year_quarter text; mirrors d_da_year_month now.
            "d_dd_year_month": "Year and month of departure like '2016/01'",
            "d_dd_quarter": "Quarter of departure like '1'",
            "d_dd_month": "Month of departure like '1'",
            "d_dd_week": "Week of departure like '53'",
            "d_dd_weekday": "Day of week like 'Friday'",
            "d_dd_weekday_short": "Day of week in short form like 'Fri'",
            "d_dd_dayofweek": "Day of week as number like '6'",
            "d_dd_day": "Day number of current date like 2016-01-01 --> 1",
        },
    ),
    # Table d_state_destinations --> To which states in the U.S. do immigrants want to continue their travel after
    # their initial arrival and what demographics can immigrants expect when they arrive in the destination state, such
    # as average temperature, population numbers or population density?
    "d_state_destinations": (
        "To which State immigrants want to continue their travel after initial arrival in the U.S.",
        {
            "d_sd_id": "PK of table d_state_destinations",
            "d_sd_state_code": "Abbreviation of State code",
            "d_sd_state_name": "Full name of State",
            "d_sd_age_median": "Median age of the population",
            "d_sd_population_male": "Average of male population",
            "d_sd_population_female": "Average of female population",
            "d_sd_population_total": "Average of population",
            "d_sd_foreign_born": "Average of the population born abroad",
        },
    ),
    "f_i94_immigrations": (
        "I-94 Immigration data to the U.S.",
        {
            "f_i94_id": "PK of table f_i94_immigrations",
            "d_ia_id": "FK of table d_immigration_airports",
            "d_sd_id": "FK of table d_state_destinations",
            "d_da_id": "FK of table d_date_arrivals",
            "d_dd_id": "FK of table d_date_departures",
            "d_ic_id": "FK of table d_immigration_countries",
            "f_i94_cit": "Country where the immigrants come from",
            "f_i94_port": "Arrival airport from immigrants to the U.S.",
            "f_i94_addr": "Location State where the immigrants want travel to",
            "f_i94_arrdate_iso": "Arrival date in the U.S.",
            "f_i94_depdate_iso": "Departure date from U.S.",
            "f_i94_dtadfile": "Date added to I-94 Files",
            "f_i94_matflag": "Match flag - Match of arrival and departure records",
            "f_i94_count": "Counter (1). This value is used for calculation purposes",
            "f_i94_year": "4 digit year when record added to I-94 Files",
            "f_i94_month": "Month when record added to I-94 Files",
            "f_i94_port_state_code": "State code of state where immigration airport (I94PORT) is located",
        },
    ),
}


def update_descriptions(json_data_dictionary):
    """Fill in the table and column descriptions of the data dictionary.

    The descriptions are taken from ``_TABLE_DESCRIPTIONS`` and written into
    *json_data_dictionary* in place.

    Args:
        json_data_dictionary: Dict with a ``'tables'`` entry; each described
            table must provide ``'table_description'`` and a ``'columns'`` dict
            whose entries carry ``'column_description'``.

    Returns:
        The same dict object that was passed in, with descriptions updated.

    Raises:
        KeyError: If a described table or column is missing from the input.
    """
    tables = json_data_dictionary['tables']

    for table_name, (table_description, column_descriptions) in _TABLE_DESCRIPTIONS.items():
        # Plain subscripting on purpose: a missing table or column should fail loudly
        # instead of silently creating an entry that does not exist in the data.
        table = tables[table_name]
        table['table_description'] = table_description

        columns = table['columns']
        for column_name, column_description in column_descriptions.items():
            columns[column_name]['column_description'] = column_description

    return json_data_dictionary

In [4]:
def persist_json_data(json_data, location_to_write):
    """Write *json_data* to *location_to_write* as pretty-printed JSON.

    Keys are sorted and the output is indented by four spaces. An existing file
    at the target location is overwritten.

    Args:
        json_data: JSON-serializable object to persist.
        location_to_write: Path of the file to create.
    """
    # ensure_ascii=False writes non-ASCII characters verbatim, so the file encoding is
    # pinned to UTF-8 instead of being left to the platform default.
    with open(location_to_write, "w", encoding="utf-8") as outfile:
        json.dump(json_data, outfile, sort_keys=True, indent=4, ensure_ascii=False)

# Parquet locations of the star schema tables whose columns are to be described.
# The last path segment of every entry is the table name.
locations_to_read = [
    "../P8_capstone_resource_files/parquet_star/PQ1/d_immigration_countries",
    "../P8_capstone_resource_files/parquet_star/PQ2/d_immigration_airports",
    "../P8_capstone_resource_files/parquet_star/PQ3/d_date_arrivals",
    "../P8_capstone_resource_files/parquet_star/PQ3/d_date_departures",
    "../P8_capstone_resource_files/parquet_star/PQ4/d_state_destinations",
    "../P8_capstone_resource_files/parquet_star/PQ4/f_i94_immigrations",
]

In [5]:
def main():
    """Create the data dictionary for the star schema and persist it as JSON.

    Steps: derive the skeleton from the parquet tables, add the project-specific
    descriptions, make sure the target folder exists and write the JSON file.
    """
    # create automatically a data dictionary based on the loaded tables (data frames)
    json_data = create_data_dictionary_from_df(locations_to_read)

    # add descriptions to data dictionary for tables and table columns
    json_data = update_descriptions(json_data)

    location_to_write = "../P8_capstone_resource_files/data_dictionary/P8_capstone_documentation_data_dictionary.json"

    # exist_ok=True already covers an existing folder, so no separate existence check is
    # needed; the folder is derived from the output path to keep both in sync.
    Path(location_to_write).parent.mkdir(parents=True, exist_ok=True)

    # persist generated json_data to disk
    persist_json_data(json_data, location_to_write)
    print("Creation of data dictionary finished")


if __name__ == "__main__":
    main()

Creation of data dictionary finished
