In [1]:
import pandas as pd

In [2]:
# Source Files
airports_data = 'airport-codes_csv.csv'
us_cities_demographics_data = 'us-cities-demographics.csv'
immigration_sample_data = 'immigration_data_sample.csv' # sample dataset only
immigration_labels_description_data = 'I94_SAS_Labels_Descriptions.SAS'
# temperature_data = 'GlobalLandTemperaturesByCity.csv' # ../../data2/

----
### Airports Data (airports_data)
- This is a simple table of airport codes and corresponding cities.
- Source: https://datahub.io/core/airport-codes#data
- The airport codes may refer to either IATA airport code, a three-letter code which is used in passenger reservation, ticketing and baggage-handling systems, or the ICAO airport code which is a four letter code used by ATC systems and for airports that do not have an IATA airport code (from wikipedia).

In [3]:
df_airports = pd.read_csv(airports_data)
df_airports.head()

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates
0,00A,heliport,Total Rf Heliport,11.0,,US,US-PA,Bensalem,00A,,00A,"-74.93360137939453, 40.07080078125"
1,00AA,small_airport,Aero B Ranch Airport,3435.0,,US,US-KS,Leoti,00AA,,00AA,"-101.473911, 38.704022"
2,00AK,small_airport,Lowell Field,450.0,,US,US-AK,Anchor Point,00AK,,00AK,"-151.695999146, 59.94919968"
3,00AL,small_airport,Epps Airpark,820.0,,US,US-AL,Harvest,00AL,,00AL,"-86.77030181884766, 34.86479949951172"
4,00AR,closed,Newport Hospital & Clinic Heliport,237.0,,US,US-AR,Newport,,,,"-91.254898, 35.6087"


<b><i>Data dictionary</i></b>

<table class="tg" align="middle">
  <tr>
    <th class="tg-0pky">Feature</th>
    <th class="tg-0pky">Description</th>
  </tr>
 <tr><td class="tg-0pky">ident</td><td class="tg-0pky">Unique identifier of every row</td>
 <tr><td class="tg-0pky">type</td><td class="tg-0pky">airport type</td>
 <tr><td class="tg-0pky">name</td><td class="tg-0pky">airport name</td>
 <tr><td class="tg-0pky">elevation_ft</td><td class="tg-0pky">airport altitude in ft</td>
 <tr><td class="tg-0pky">iso_country</td><td class="tg-0pky">country code in ISO format</td>
 <tr><td class="tg-0pky">iso_region</td><td class="tg-0pky">airports's location region code in ISO format</td>
 <tr><td class="tg-0pky">municipality</td><td class="tg-0pky">municipality where airport is located</td>
 <tr><td class="tg-0pky">gps_code</td><td class="tg-0pky">airport GPS code</td>
 <tr><td class="tg-0pky">iata_code</td><td class="tg-0pky">airport IATA code</td>
 <tr><td class="tg-0pky">local_code</td><td class="tg-0pky">airport ICAO code</td>
 <tr><td class="tg-0pky">coordinates</td><td class="tg-0pky">airport GPS coordinates</td>
</table>

In [6]:
df_airports.shape

(55075, 12)

##### Examples of queries:

In [7]:
# Top 3 countries with most airports
df_airports.groupby(by='iso_country').count()['ident'].sort_values(ascending=False)[:3]

iso_country
US    22757
BR     4334
CA     2784
Name: ident, dtype: int64

In [8]:
# Top 3 regions with most airports in US
df_airports[df_airports['iso_country'] == 'US'].groupby(by='iso_region').count()['ident'].sort_values(ascending=False)[:3]

iso_region
US-TX    2277
US-CA    1088
US-FL     967
Name: ident, dtype: int64

In [9]:
# Airport at highest altitude
highest_elevation = df_airports['elevation_ft'].max()
df_airports[df_airports['elevation_ft'] == highest_elevation][['name', 'iso_country', 'continent', 'elevation_ft']]

Unnamed: 0,name,iso_country,continent,elevation_ft
24028,Siachen Glacier AFS Airport,IN,AS,22000.0


---
### US Cities Demographics Data (us_cities_demographics_data)
- Simple table with demographic statistics of US cities.
- This dataset includes information about the population of US cities such as race, household size, median age and population size.
- Source: https://public.opendatasoft.com/explore/dataset/us-cities-demographics/export/ (OpenSoft)

In [11]:
df_us_cities_demographics = pd.read_csv(us_cities_demographics_data, delimiter=';')
df_us_cities_demographics.head()

Unnamed: 0,City,State,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,State Code,Race,Count
0,Silver Spring,Maryland,33.8,40601.0,41862.0,82463,1562.0,30908.0,2.6,MD,Hispanic or Latino,25924
1,Quincy,Massachusetts,41.0,44129.0,49500.0,93629,4147.0,32935.0,2.39,MA,White,58723
2,Hoover,Alabama,38.5,38040.0,46799.0,84839,4819.0,8229.0,2.58,AL,Asian,4759
3,Rancho Cucamonga,California,34.5,88127.0,87105.0,175232,5821.0,33878.0,3.18,CA,Black or African-American,24437
4,Newark,New Jersey,34.6,138040.0,143873.0,281913,5829.0,86253.0,2.73,NJ,White,76402


<b><i>Data dictionary</i></b>

<table class="tg" align="middle">
  <tr>
    <th class="tg-0pky">Feature</th>
    <th class="tg-0pky">Description</th>
  </tr>
 <tr><td class="tg-0pky">City</td><td class="tg-0pky">US city name</td>
 <tr><td class="tg-0pky">State</td><td class="tg-0pky">US State of the city</td>
 <tr><td class="tg-0pky">Median Age</td><td class="tg-0pky">Median population age</td>
 <tr><td class="tg-0pky">Male Population</td><td class="tg-0pky">Total number of males</td>
 <tr><td class="tg-0pky">Female Population</td><td class="tg-0pky">Total number of females</td>
 <tr><td class="tg-0pky">Total Population</td><td class="tg-0pky">Total population</td>
 <tr><td class="tg-0pky">Number of Veternas</td><td class="tg-0pky">Number of veterans living in the city</td>
 <tr><td class="tg-0pky">Foreign-born</td><td class="tg-0pky">Number of residents who were not born in the city</td>
 <tr><td class="tg-0pky">Average Household Size</td><td class="tg-0pky">Average household in the city</td>
 <tr><td class="tg-0pky">Race</td><td class="tg-0pky">Race class</td>
 <tr><td class="tg-0pky">Count</td><td class="tg-0pky">Number of individuals in each race</td>
</table>

In [12]:
df_us_cities_demographics.shape

(2891, 12)

##### Examples of queries:

In [13]:
# Top 3 States with most population
df_us_cities_demographics.groupby(by='State').sum()['Total Population'].sort_values(ascending=False)[:3]

State
California    123444353
Texas          70553853
New York       49002055
Name: Total Population, dtype: int64

In [14]:
# Average stats per State & City
df_us_cities_demographics.groupby(by=['State','City']).mean()#['Average Household Size'].sort_values(ascending=False)[:3]

Unnamed: 0_level_0,Unnamed: 1_level_0,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,Count
State,City,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama,Birmingham,35.6,102122.0,112789.0,214911.0,13212.0,8258.0,2.21,44294.40
Alabama,Dothan,38.9,32172.0,35364.0,67536.0,6334.0,1699.0,2.59,14058.80
Alabama,Hoover,38.5,38040.0,46799.0,84839.0,4819.0,8229.0,2.58,22062.25
Alabama,Huntsville,38.1,91764.0,97350.0,189114.0,16637.0,12691.0,2.18,40534.60
Alabama,Mobile,38.0,91275.0,103030.0,194305.0,11939.0,7234.0,2.40,40743.00
Alabama,Montgomery,35.4,94582.0,106004.0,200586.0,14955.0,9337.0,2.41,41869.60
Alabama,Tuscaloosa,29.1,47293.0,51045.0,98338.0,3647.0,4706.0,2.67,20080.60
Alaska,Anchorage,32.2,152945.0,145750.0,298695.0,27492.0,33258.0,2.77,67245.60
Arizona,Avondale,29.1,38712.0,41971.0,80683.0,4815.0,8355.0,3.18,22385.00
Arizona,Casas Adobes,44.8,30890.0,34375.0,65265.0,6601.0,7024.0,2.24,15895.00


---
### I94 Immigration Data

- This data comes from the US National Tourism and Trade Office (https://www.trade.gov/national-travel-and-tourism-office).
- This dataset contains information about immigration records such as the month and year, immigrants' gender, country of residence, birthday and occupation while in the US, port of arrival and US State, Mode of Transportation (Air, Sea, Land, ...), among others.
- For this project the data is in a folder located at ``../../data/18-83510-I94-Data-2016``.
- There is a SAS file (``sas7bdat`` format) for each month of the year.
- Below a sample of the data is used for EDA purposes.
- This dataset will serve as a basis to the fact table in our dimension model.


#### Immigration sample data (immigration_sample_data)

In [15]:
df_immigration_sample = pd.read_csv(immigration_sample_data)
df_immigration_sample.head()

Unnamed: 0.1,Unnamed: 0,cicid,i94yr,i94mon,i94cit,i94res,i94port,arrdate,i94mode,i94addr,...,entdepu,matflag,biryear,dtaddto,gender,insnum,airline,admnum,fltno,visatype
0,2027561,4084316.0,2016.0,4.0,209.0,209.0,HHW,20566.0,1.0,HI,...,,M,1955.0,7202016,F,,JL,56582670000.0,00782,WT
1,2171295,4422636.0,2016.0,4.0,582.0,582.0,MCA,20567.0,1.0,TX,...,,M,1990.0,10222016,M,,*GA,94362000000.0,XBLNG,B2
2,589494,1195600.0,2016.0,4.0,148.0,112.0,OGG,20551.0,1.0,FL,...,,M,1940.0,7052016,M,,LH,55780470000.0,00464,WT
3,2631158,5291768.0,2016.0,4.0,297.0,297.0,LOS,20572.0,1.0,CA,...,,M,1991.0,10272016,M,,QR,94789700000.0,00739,B2
4,3032257,985523.0,2016.0,4.0,111.0,111.0,CHM,20550.0,3.0,NY,...,,M,1997.0,7042016,F,,,42322570000.0,LAND,WT


Inspecting ``I94_SAS_Labels_Descriptions.SAS``, we can draw the following data dictionary for this dataset:

<b><i>Data dictionary</i></b>

<table class="tg" align="middle">
  <tr>
    <th class="tg-0pky">Feature</th>
    <th class="tg-0pky">Description</th>
  </tr>
 <tr><td class="tg-0pky">cicid</td><td class="tg-0pky">Unique record ID</td>
 <tr><td class="tg-0pky">i94yr</td><td class="tg-0pky">4 digit year</td>
 <tr><td class="tg-0pky">i94mon</td><td class="tg-0pky">Numeric month</td>
 <tr><td class="tg-0pky">i94cit</td><td class="tg-0pky">3 digit code for immigrant country of birth</td>
 <tr><td class="tg-0pky">i94res</td><td class="tg-0pky">3 digit code for immigrant country of residence </td>
 <tr><td class="tg-0pky">i94port</td><td class="tg-0pky">Port of admission</td>
 <tr><td class="tg-0pky">arrdate</td><td class="tg-0pky">Arrival Date in the USA</td>
 <tr><td class="tg-0pky">i94mode</td><td class="tg-0pky">Mode of transportation (1 = Air; 2 = Sea; 3 = Land; 9 = Not reported)</td>
 <tr><td class="tg-0pky">i94addr</td><td class="tg-0pky">USA State of arrival</td>
 <tr><td class="tg-0pky">depdate</td><td class="tg-0pky">Departure Date from the USA</td>
 <tr><td class="tg-0pky">i94bir</td><td class="tg-0pky">Age of Respondent in Years</td>
 <tr><td class="tg-0pky">i94visa</td><td class="tg-0pky">Visa codes collapsed into three categories</td>
 <tr><td class="tg-0pky">count</td><td class="tg-0pky">Field used for summary statistics</td>
 <tr><td class="tg-0pky">dtadfile</td><td class="tg-0pky">Character Date Field - Date added to I-94 Files</td>
 <tr><td class="tg-0pky">visapost</td><td class="tg-0pky">Department of State where where Visa was issued </td>
 <tr><td class="tg-0pky">occup</td><td class="tg-0pky">Occupation that will be performed in U.S</td>
 <tr><td class="tg-0pky">entdepa</td><td class="tg-0pky">Arrival Flag - admitted or paroled into the U.S.</td>
 <tr><td class="tg-0pky">entdepd</td><td class="tg-0pky">Departure Flag - Departed, lost I-94 or is deceased</td>
 <tr><td class="tg-0pky">entdepu</td><td class="tg-0pky">Update Flag - Either apprehended, overstayed, adjusted to perm residence</td>
 <tr><td class="tg-0pky">matflag</td><td class="tg-0pky">Match flag - Match of arrival and departure records</td>
 <tr><td class="tg-0pky">biryear</td><td class="tg-0pky">4 digit year of birth</td>
 <tr><td class="tg-0pky">dtaddto</td><td class="tg-0pky">Character Date Field - Date to which admitted to U.S. (allowed to stay until)</td>
 <tr><td class="tg-0pky">gender</td><td class="tg-0pky">Non-immigrant sex</td>
 <tr><td class="tg-0pky">insnum</td><td class="tg-0pky">INS number</td>
 <tr><td class="tg-0pky">airline</td><td class="tg-0pky">Airline used to arrive in U.S.</td>
 <tr><td class="tg-0pky">admnum</td><td class="tg-0pky">Admission Number</td>
 <tr><td class="tg-0pky">fltno</td><td class="tg-0pky">Flight number of Airline used to arrive in U.S.</td>
 <tr><td class="tg-0pky">visatype</td><td class="tg-0pky">Class of admission legally admitting the non-immigrant to temporarily stay in U.S.</td>
</table>

In [16]:
# year with most data => Sample only contains data of 2016
df_immigration_sample['i94yr'].value_counts()

2016.0    1000
Name: i94yr, dtype: int64

In [33]:
# month with most data of month = 4 (April)
df_immigration_sample[['i94yr', 'i94mon']].groupby(['i94yr', 'i94mon']).count()

i94yr,i94mon
2016.0,4.0


In [25]:
# COUNTRY CODDE of most immigrants
df_immigration_sample['i94res'].value_counts(ascending=False)


135.0    119
209.0     79
582.0     58
245.0     54
112.0     52
438.0     51
111.0     48
276.0     37
689.0     37
213.0     34
123.0     33
687.0     22
691.0     19
130.0     19
268.0     16
251.0     16
131.0     14
117.0     14
158.0     13
696.0     13
116.0     13
129.0     12
577.0     10
124.0     10
692.0      9
260.0      9
575.0      7
464.0      7
104.0      7
263.0      7
        ... 
299.0      2
509.0      2
115.0      2
324.0      1
257.0      1
141.0      1
218.0      1
164.0      1
518.0      1
266.0      1
201.0      1
274.0      1
373.0      1
350.0      1
525.0      1
603.0      1
511.0      1
602.0      1
297.0      1
121.0      1
340.0      1
113.0      1
520.0      1
243.0      1
127.0      1
272.0      1
504.0      1
105.0      1
162.0      1
688.0      1
Name: i94res, Length: 91, dtype: int64