In [1]:
# Import libraries for:

# Web scraping
import requests
from bs4 import BeautifulSoup
from myfuncs import * # Self-defined functions for pulling data from specific sites
from dotenv import load_dotenv
import os

# Data analysis
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import MDS
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score #, calinski_harabasz_score

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import geopandas as gpd

# Seed NumPy's global random number generator so that code drawing from it
# yields the same values on a fresh Restart & Run All (given the same cell order).
np.random.seed(42)




# Data Extraction & Preprocessing

## State-level Data

### FRED API

In [2]:
# Median household income by state, taken from the FRED release table below.
# Note: the returned table also carries a national "The United States" row.
variable = 'Median Household Income'
url = 'https://fred.stlouisfed.org/release/tables?rid=249&eid=259462'

income_df = extract_FRED_data(url, variable)
income_df.head()

Unnamed: 0,State Name,Median Household Income [Current Dollars],Year
0,The United States,80610.0,2023
1,Alabama,60660.0,2023
2,Alaska,98190.0,2023
3,Arizona,82660.0,2023
4,Arkansas,63250.0,2023


In [3]:
# Unemployment rate by state, taken from the FRED release table below.
variable = 'Unemployment Rate'
url = 'https://fred.stlouisfed.org/release/tables?eid=840687&rid=116'

UE_df = extract_FRED_data(url, variable)
UE_df.head()

Unnamed: 0,State Name,Unemployment Rate [Percent],Year
0,Alabama,2.5,2023
1,Alaska,4.2,2023
2,Arizona,3.9,2023
3,Arkansas,3.3,2023
4,California,4.8,2023


In [4]:
# Resident population by state, taken from the FRED release table below.
variable = 'Population'
url = 'https://fred.stlouisfed.org/release/tables?eid=259194&rid=118'

pop_df = extract_FRED_data(url, variable)
pop_df.head()

Unnamed: 0,State Name,Population [Thousands of Persons],Year
0,Alabama,5108.468,2023
1,Alaska,733.406,2023
2,Arizona,7431.344,2023
3,Arkansas,3067.732,2023
4,California,38965.193,2023


In [17]:
# Combine the three FRED tables (income_df, UE_df, pop_df) into a single frame.
# Inner joins on (State Name, Year) keep only the rows present in all three.

fred_df = (
    income_df
    .merge(UE_df, how='inner', on=['State Name', 'Year'])
    .merge(pop_df, how='inner', on=['State Name', 'Year'])
)
# fred_df.to_csv('assets/FRED_data.csv', index=None)
# fred_df = pd.read_csv('assets/FRED_data.csv')
fred_df.head()

Unnamed: 0,State Name,Median Household Income [Current Dollars],Year,Unemployment Rate [Percent],Population [Thousands of Persons]
0,Alabama,60660.0,2023,2.5,5108.468
1,Alaska,98190.0,2023,4.2,733.406
2,Arizona,82660.0,2023,3.9,7431.344
3,Arkansas,63250.0,2023,3.3,3067.732
4,California,89870.0,2023,4.8,38965.193


### US Census Bureau: American Community Survey (ACS)

In [6]:
# Read the .env file into the process environment before looking up the key.
# Without this call os.getenv('API_KEY_USCB') only sees variables that were
# already exported in the shell and otherwise returns None. load_dotenv() does
# not override variables that are already set, so an exported key still wins.
load_dotenv()
api_key_USCB = os.getenv('API_KEY_USCB')

# Pull the ACS tables for the two survey years used below.
# NOTE(review): state_code=None presumably requests every state - confirm in myfuncs.
acs_2023_df = extract_and_preprocess_ACS_data(api_key=api_key_USCB, year=2023, state_code=None)
acs_2022_df = extract_and_preprocess_ACS_data(api_key=api_key_USCB, year=2022, state_code=None)

In [14]:
# Stack the two ACS survey years into one long frame with a fresh 0..n-1 index.
# The second operand must be the 2022 frame: concatenating acs_2023_df with
# itself would duplicate every 2023 row and leave the 2022 data out entirely.
acs_df = pd.concat([acs_2023_df, acs_2022_df], ignore_index=True)
acs_df.head()

Unnamed: 0,Median Household Income,Per Capita Income,Gini Index of Income Inequality,Total Population,Median Age,Median Home Value,State Code (FIPS),Year,Unemployment Rate,Percent Foreigners
0,62212.0,35046.0,0.4771,5108468.0,39.6,216600.0,1,2023,0.018804,0.039766
1,86631.0,45792.0,0.4492,733406.0,36.5,347500.0,2,2023,0.023868,0.074408
2,77315.0,41290.0,0.465,7431344.0,39.3,411200.0,4,2023,0.020882,0.131999
3,58700.0,33012.0,0.474,3067732.0,38.9,195700.0,5,2023,0.019464,0.052919
4,95521.0,48013.0,0.487,38965193.0,38.2,725800.0,6,2023,0.028509,0.273065


### State Information

In [12]:
# Map state FIPS code to state names.
# extract_state_mapper is not defined in this notebook (presumably it comes from
# the `from myfuncs import *` import - confirm). The displayed result has one row
# per state with its name, FIPS code and USPS abbreviation.

state_df = extract_state_mapper()
# Optional CSV cache of the mapper, kept commented out:
# state_df.to_csv('assets/state_info.csv', index=None)
# state_df = pd.read_csv('assets/state_info.csv')
state_df.head()

Unnamed: 0,State Name,State Code (FIPS),State Code (USPS)
0,Alabama,1,AL
1,Alaska,2,AK
2,Arizona,4,AZ
3,Arkansas,5,AR
4,California,6,CA


In [16]:
# Attach state names and USPS codes to the ACS rows via the shared FIPS code,
# then discard every row that still contains a missing value.
acs_df = acs_df.merge(state_df, how='left', on='State Code (FIPS)').dropna()
# acs_df.to_csv('assets/acs_df.csv',index=None)
# acs_df = pd.read_csv('assets/acs_df.csv')
acs_df.head()

Unnamed: 0,Median Household Income,Per Capita Income,Gini Index of Income Inequality,Total Population,Median Age,Median Home Value,State Code (FIPS),Year,Unemployment Rate,Percent Foreigners,State Name,State Code (USPS)
0,62212.0,35046.0,0.4771,5108468.0,39.6,216600.0,1,2023,0.018804,0.039766,Alabama,AL
1,86631.0,45792.0,0.4492,733406.0,36.5,347500.0,2,2023,0.023868,0.074408,Alaska,AK
2,77315.0,41290.0,0.465,7431344.0,39.3,411200.0,4,2023,0.020882,0.131999,Arizona,AZ
3,58700.0,33012.0,0.474,3067732.0,38.9,195700.0,5,2023,0.019464,0.052919,Arkansas,AR
4,95521.0,48013.0,0.487,38965193.0,38.2,725800.0,6,2023,0.028509,0.273065,California,CA


## Air Travel Data

In [27]:
# Domestic flights by origin airport (2023).
# Data: flight volume per Airport Code; this is the main dataset of the analysis.
# Source: CSV downloaded from https://equity-data.dot.gov/datasets/17e9a793c7cf47c8b64dab92da55dfe5/about

fp_flights = 'assets/T100_Domestic_Market_and_Segment_Data_-3591723781169319541.csv'
df_flights = (
    pd.read_csv(fp_flights)
    .rename(columns=str.title)                   # Title-case every header
    .rename(columns={'Origin': 'Airport Code'})  # origin airports are given by their codes
)
df_flights.head() # Here, origin airport names ('origin') are abbreviated by the respective Airport Codes.

Unnamed: 0,Objectid,Year,Airport Code,Enplanements,Passengers,Departures,Arrivals,Freight,Mail
0,1,2023,ATL,44721151,44940145,340757,340435.0,320592384,17033732
1,2,2023,DEN,35892655,36089755,307789,307873.0,257037213,12698972
2,3,2023,DFW,33853033,33892107,297097,297129.0,442306764,22686642
3,4,2023,ORD,29305603,29379846,303030,302931.0,386208675,29107924
4,5,2023,LAS,26216256,26443518,202540,202475.0,89632420,2773334


In [None]:
# TODO: Map airports to their respective states based on Airport Code (not implemented yet)

## EDA

In [None]:
# Filter state-level socio-demographic features based on the presence/absence of correlation with enplanements
numeric_cols = ['Enplanements']
# TODO: build flight_df here. The statement was left as an unfinished
# assignment (`flight_df = ` with no right-hand side), which is a SyntaxError
# and stops the notebook on Restart & Run All. It stays a TODO until the
# filtering logic is written.

In [25]:
# EDA
# df_flights[df_flights.isna().any(axis=1)] # Mainly NA for Arrivals column

numeric_cols = ['Enplanements', 'Passengers', 'Departures', 'Arrivals', 'Freight', 'Mail']
# Keep only the numeric flight metrics and drop rows with any missing value.
df_flights_cleaned = df_flights[numeric_cols].dropna()

# df_flights_cleaned_sample = df_flights_cleaned.sample(100)
# sns.pairplot(df_flights_cleaned_sample[numeric_cols]) #looks to have outliers (extreme high end), let's use StandardScaler

# Standardize every metric on a copy so df_flights_cleaned keeps its raw values.
df_flights_cleaned_normalized = df_flights_cleaned.copy()
scaler = StandardScaler()
df_flights_cleaned_normalized[numeric_cols] = scaler.fit_transform(df_flights_cleaned_normalized[numeric_cols])

# Draw a plotting sample of up to 100 rows.
# - random_state pins the draw, so re-running this cell (in any order) returns
#   the same rows instead of depending on the state of NumPy's global RNG.
# - n is capped at the frame length because sample() raises ValueError when
#   asked for more rows than exist (without replacement).
df_flights_cleaned_normalized_sample = df_flights_cleaned_normalized.sample(
    n=min(100, len(df_flights_cleaned_normalized)),
    random_state=42,
)

# Define temporary df for plotting (currently the full normalized frame, not the sample)
# tmp = df_flights_cleaned_normalized_sample.copy()
tmp = df_flights_cleaned_normalized.copy()

# Replot SPLOM
# sns.pairplot(tmp[numeric_cols]) 
# plt.savefig('visualizations/flights_sampled100_normalized_SPLOMs.png', dpi=300)
# plt.show()

# See the relationship between number of departing passengers vs. departed flights 
# tmp.plot(x='Departures',y='Enplanements', kind='scatter')
# plt.savefig('visualizations/flights_normalized_Enplanements_vs_Depatures.png', dpi=300)

# Load saved SPLOM
# img_name = 'flights_sampled100_normalized_SPLOMs'
# img_name = 'flights_normalized_Enplanements_vs_Depatures'
# img = mpimg.imread(f'visualizations/{img_name}.png')
# plt.imshow(img)
# plt.axis('off')
# plt.show()

/Users/ningnong/Desktop/MADS/SIADS696_MilestoneII/Air Travel Project/EDA
