# Workbook contains the following topics:
- 1) Country Mapping / Dimension Table 
- 2) US-States Dimension Table

## 1) Country Mapping for I94CIT and I94RES (and Continent Countries Dimension Table)

- Convert I94CIT and I94RES numeric codes into 2-digit ISO-Country Code
- ISO-Country Code enables usage of continent hierarchy
- This is performed via a fuzzy matching (pycountry package) / punctual cleansing

In [1]:
import pandas as pd
import pycountry

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_rows', 300)

In [2]:
# read sas-file documentation 
# https://knowledge.udacity.com/questions/125439

with open('./data/I94_SAS_Labels_Descriptions.SAS') as f:
    f_content = f.read()
    f_content = f_content.replace('\t', '')
    

def code_mapper(file, idx):
    f_content2 = f_content[f_content.index(idx):]
    f_content2 = f_content2[:f_content2.index(';')].split('\n')
    f_content2 = [i.replace("'", "") for i in f_content2]
    dic = [i.split('=') for i in f_content2[1:]]
    dic = dict([i[0].strip(), i[1].strip()] for i in dic if len(i) == 2)
    return dic

In [3]:
# get mapping dictionary from file
mapper = code_mapper(f_content, "i94cntyl")

In [4]:
# get first element
mapper.get('582')

'MEXICO Air Sea, and Not Reported (I-94, no land arrivals)'

- key is a string must be converted when mapping with I94-dataset

In [5]:
# convert mapper dictionary to dataframe
mapper_df = pd.DataFrame([{"i94_code":k, "i94_desc":v} for k,v in mapper.items()])

In [6]:
# exlucde invalid elements
mapper_df = mapper_df[~mapper_df['i94_desc'].str.contains("No Country")]
mapper_df = mapper_df[~mapper_df['i94_desc'].str.contains("INVALID")]
mapper_df = mapper_df[~mapper_df['i94_desc'].str.contains("should not show")]

In [7]:
mapper_df.sample(5)

Unnamed: 0,i94_code,i94_desc
56,322,DJIBOUTI
69,110,FINLAND
182,348,SIERRA LEONE
90,528,HONDURAS
38,383,CENTRAL AFRICAN REPUBLIC


In [8]:
# perform fuzzy matching on country name to get 2-digit ISO Code
def get_country_code(country_name):
    try:
        country_code = pycountry.countries.search_fuzzy(country_name)[0].alpha_2
        return country_code
    except:
        #print(f"{country_name} not found")
        return "no match"

In [9]:
mapper_df['iso_code'] = mapper_df['i94_desc'].apply(get_country_code)

In [10]:
# check no-matches
mapper_df[mapper_df['iso_code'] == 'no match']

Unnamed: 0,i94_code,i94_desc,iso_code
0,582,"MEXICO Air Sea, and Not Reported (I-94, no lan...",no match
7,518,ANTIGUA-BARBUDA,no match
25,717,"BONAIRE, ST EUSTATIUS, SABA",no match
26,164,BOSNIA-HERZEGOVINA,no match
33,243,BURMA,no match
36,326,CAPE VERDE,no match
41,245,"CHINA, PRC",no match
44,271,COCOS ISLANDS,no match
54,723,FAROE ISLANDS (PART OF DENMARK),no match
59,240,EAST TIMOR,no match


In [11]:
# punctual correction of non matching values

no_match_df = mapper_df[mapper_df['iso_code'] =='no match'].copy()

no_match_df['i94_desc'] = no_match_df['i94_desc'].str.replace("ST.","SAINT")
no_match_df['i94_desc'] = no_match_df['i94_desc'].str.replace("SAINT KITTS-NEVIS","SAINT KITTS")
no_match_df['i94_desc'] = no_match_df['i94_desc'].str.replace("NORTH KOREA","KOREA, Democratic")
no_match_df['i94_desc'] = no_match_df['i94_desc'].str.replace("SOUTH KOREA","KOREA, Republic")
no_match_df['i94_desc'] = no_match_df['i94_desc'].str.replace("SAINT MAARTEN","MAARTEN")
no_match_df['i94_desc'] = no_match_df['i94_desc'].str.replace("SAINT VINCENT-GRENADINES","SAINT VINCENT")

# re-perform match
no_match_df['iso_code'] = no_match_df['i94_desc'].apply(get_country_code)

In [12]:
no_match_df

Unnamed: 0,i94_code,i94_desc,iso_code
0,582,"MEXICO Air Sea, and Not Reported (I-94, no lan...",no match
7,518,ANTIGUA-BARBUDA,no match
25,717,"BONAIRE, SAINTEUSAINTTIUS, SABA",no match
26,164,BOSNIA-HERZEGOVINA,no match
33,243,BURMA,no match
36,326,CAPE VERDE,no match
41,245,"CHINA, PRC",no match
44,271,COCOS ISLANDS,no match
54,723,FAROE ISLANDS (PART OF DENMARK),no match
59,240,EASAINTTIMOR,no match


In [13]:
# remove no match iso-codes from original dataframe
mapper_df = mapper_df[mapper_df['iso_code'] != 'no match']

# append new matches
mapper_df = mapper_df.append(no_match_df[no_match_df['iso_code'] != 'no match'])

# keep no matches for further fuzzy-search
no_match_df = no_match_df[no_match_df['iso_code'] == 'no match']

In [14]:
no_match_df

Unnamed: 0,i94_code,i94_desc,iso_code
0,582,"MEXICO Air Sea, and Not Reported (I-94, no lan...",no match
7,518,ANTIGUA-BARBUDA,no match
25,717,"BONAIRE, SAINTEUSAINTTIUS, SABA",no match
26,164,BOSNIA-HERZEGOVINA,no match
33,243,BURMA,no match
36,326,CAPE VERDE,no match
41,245,"CHINA, PRC",no match
44,271,COCOS ISLANDS,no match
54,723,FAROE ISLANDS (PART OF DENMARK),no match
59,240,EASAINTTIMOR,no match


In [15]:
# split i94_desc by space

no_match_df['i94_desc'] = no_match_df['i94_desc'].str.split(" ", expand=True)[0].str.split("-",expand=True)[0].str.replace(","," ").str.strip()

In [16]:
# apply fuzzy search

no_match_df['iso_code'] = no_match_df['i94_desc'].apply(get_country_code)

In [17]:
no_match_df[no_match_df['iso_code'] == "no match"]

Unnamed: 0,i94_code,i94_desc,iso_code
33,243,BURMA,no match
59,240,EASAINTTIMOR,no match
103,388,IVORY,no match
107,201,KAMPUCHEA,no match
114,203,LAOS,no match
123,214,MACAU,no match
202,351,SWAZILAND,no match
228,757,WESAINTINDIES,no match
230,465,WESAINTRN,no match
232,139,YUGOSLAVIA,no match


- ignore last non matching items (could be mapped with high manual effort e. g. BURMA -> Myanmar)

In [18]:
# final dataset - including

mapper_df

Unnamed: 0,i94_code,i94_desc,iso_code
1,236,AFGHANISTAN,AF
2,101,ALBANIA,AL
3,316,ALGERIA,DZ
4,102,ANDORRA,AD
5,324,ANGOLA,AO
6,529,ANGUILLA,AI
8,687,ARGENTINA,AR
9,151,ARMENIA,AM
10,532,ARUBA,AW
11,438,AUSTRALIA,AU


In [19]:
mapper_df[mapper_df['iso_code'] == 'RS']

Unnamed: 0,i94_code,i94_desc,iso_code
111,732,KOSOVO,RS
180,745,SERBIA,RS


In [20]:
mapper_df[mapper_df['iso_code'] == 'NA']

Unnamed: 0,i94_code,i94_desc,iso_code
144,371,NAMIBIA,


- check samples

In [21]:
mapper_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 208 entries, 1 to 199
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   i94_code  208 non-null    object
 1   i94_desc  208 non-null    object
 2   iso_code  208 non-null    object
dtypes: object(3)
memory usage: 6.5+ KB


- no null values

In [22]:
# export mapping list

mapper_df.to_csv("../staging/countries_mapping.csv", sep=";", index=False)

### Import Country - Continent Table (Dimension Table)

https://datahub.io/JohnSnowLabs/country-and-continent-codes-list#resource-country-and-continent-codes-list-csv

In [32]:
# import country table
df_con = pd.read_csv("./data/country-and-continent-codes-list-csv_csv.csv")
df_con.head()

Unnamed: 0,Continent_Name,Continent_Code,Country_Name,Two_Letter_Country_Code,Three_Letter_Country_Code,Country_Number
0,Asia,AS,"Afghanistan, Islamic Republic of",AF,AFG,4.0
1,Europe,EU,"Albania, Republic of",AL,ALB,8.0
2,Antarctica,AN,Antarctica (the territory South of 60 deg S),AQ,ATA,10.0
3,Africa,AF,"Algeria, People's Democratic Republic of",DZ,DZA,12.0
4,Oceania,OC,American Samoa,AS,ASM,16.0


In [33]:
df_con[df_con["Two_Letter_Country_Code"] == 'NA']

Unnamed: 0,Continent_Name,Continent_Code,Country_Name,Two_Letter_Country_Code,Three_Letter_Country_Code,Country_Number


In [34]:
# select relevant columns
df_con = df_con[['Two_Letter_Country_Code', 'Country_Name', 'Continent_Name']]

# rename columns
df_con.columns = ['country_id', 'country_name', 'continent_name']

In [35]:
df_con.head()

Unnamed: 0,country_id,country_name,continent_name
0,AF,"Afghanistan, Islamic Republic of",Asia
1,AL,"Albania, Republic of",Europe
2,AQ,Antarctica (the territory South of 60 deg S),Antarctica
3,DZ,"Algeria, People's Democratic Republic of",Africa
4,AS,American Samoa,Oceania


In [36]:
# merge dataframes to see matches

df_merge = pd.merge(mapper_df, df_con, how='left', left_on=['iso_code'] , right_on=['country_id'] )

In [37]:
df_merge

Unnamed: 0,i94_code,i94_desc,iso_code,country_id,country_name,continent_name
0,236,AFGHANISTAN,AF,AF,"Afghanistan, Islamic Republic of",Asia
1,101,ALBANIA,AL,AL,"Albania, Republic of",Europe
2,316,ALGERIA,DZ,DZ,"Algeria, People's Democratic Republic of",Africa
3,102,ANDORRA,AD,AD,"Andorra, Principality of",Europe
4,324,ANGOLA,AO,AO,"Angola, Republic of",Africa
5,529,ANGUILLA,AI,AI,Anguilla,North America
6,687,ARGENTINA,AR,AR,"Argentina, Argentine Republic",South America
7,151,ARMENIA,AM,AM,"Armenia, Republic of",Europe
8,151,ARMENIA,AM,AM,"Armenia, Republic of",Asia
9,532,ARUBA,AW,AW,Aruba,North America


In [38]:
df_merge[df_merge['continent_name'].isnull()]

Unnamed: 0,i94_code,i94_desc,iso_code,country_id,country_name,continent_name
129,371,NAMIBIA,,,,


- assign Namibia to Africa

In [39]:
df_merge.loc[129]['country_id'] = 'NA'
df_merge.loc[129]['country_name'] = 'Namibia'
df_merge.loc[129]['continent_name'] = 'Africa'

In [40]:
df_merge.loc[129]

i94_code              371
i94_desc          NAMIBIA
iso_code               NA
country_id             NA
country_name      Namibia
continent_name     Africa
Name: 129, dtype: object

In [42]:
# add dummy line for unknown continent
df_merge = pd.DataFrame.from_dict({'country_id':['99'],
                                   'country_name':['Unknown Country'],
                                   'continent_name':['Unknown Continent Name']
                                  }).append(df_merge).reset_index(drop=True)

In [44]:
df_merge[df_merge['country_id'] == '99']

Unnamed: 0,country_id,country_name,continent_name,i94_code,i94_desc,iso_code
0,99,Unknown Country,Unknown Continent Name,,,


In [49]:
df_merge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216 entries, 0 to 215
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   country_id      216 non-null    object
 1   country_name    216 non-null    object
 2   continent_name  216 non-null    object
dtypes: object(3)
memory usage: 5.2+ KB


- country continent dimension table looks good

In [50]:
df_merge = df_merge[['country_id','country_name','continent_name']].sort_values(by=["continent_name","country_name"])

In [51]:
df_merge

Unnamed: 0,country_id,country_name,continent_name
3,DZ,"Algeria, People's Democratic Republic of",Africa
5,AO,"Angola, Republic of",Africa
205,AO,"Angola, Republic of",Africa
21,BJ,"Benin, Republic of",Africa
26,BW,"Botswana, Republic of",Africa
31,BF,Burkina Faso,Africa
32,BI,"Burundi, Republic of",Africa
33,CM,"Cameroon, Republic of",Africa
35,CF,Central African Republic,Africa
36,TD,"Chad, Republic of",Africa


In [52]:
df_merge.to_csv("../staging/countries.csv", sep=";", index=False)

## 2) US-States Dimension Table

- create dimension table for i94addrl (1st address after arrival)

In [53]:
# get mapping dictionary from file
mapper = code_mapper(f_content, "i94addrl")

In [58]:
# convert mapper dictionary to dataframe
states_df = pd.DataFrame([{"state_id":k, "state_name":v} for k,v in mapper.items()])

In [59]:
states_df

Unnamed: 0,state_id,state_name
0,AL,ALABAMA
1,AK,ALASKA
2,AZ,ARIZONA
3,AR,ARKANSAS
4,CA,CALIFORNIA
5,CO,COLORADO
6,CT,CONNECTICUT
7,DE,DELAWARE
8,DC,DIST. OF COLUMBIA
9,FL,FLORIDA


In [60]:
states_df.to_csv("../staging/us_states.csv", sep=";", index=False)