In [48]:
# Import libraries for:

# Web scraping
import requests
from bs4 import BeautifulSoup
from myfuncs import * # Self-defined functions for pulling data from specific sites
from dotenv import load_dotenv
import os

# Data analysis
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import MDS
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score #, calinski_harabasz_score

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import geopandas as gpd

# Fix the seed of NumPy's global random state so results are reproducible.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)


# Data Extraction

## State-level Data

### FRED API

In [49]:
# Median household income by state (FRED release table: rid=249, eid=259462).
url = 'https://fred.stlouisfed.org/release/tables?rid=249&eid=259462'
variable = 'Median Household Income'

# extract_FRED_data is not defined in this notebook; presumably it arrives via
# `from myfuncs import *` -- confirm there.
income_df = extract_FRED_data(url, variable)
income_df.head(n=5)

Unnamed: 0,State Name,Median Household Income [Current Dollars] (2023),Median Household Income [Current Dollars] (2022)
0,The United States,80610.0,74580.0
1,Alabama,60660.0,59910.0
2,Alaska,98190.0,89740.0
3,Arizona,82660.0,73450.0
4,Arkansas,63250.0,53980.0


In [40]:
# Unemployment rate by state (FRED release table: rid=116, eid=840687).
url = 'https://fred.stlouisfed.org/release/tables?eid=840687&rid=116'
variable = 'Unemployment Rate'

# extract_FRED_data is not defined in this notebook; presumably it arrives via
# `from myfuncs import *` -- confirm there.
UE_df = extract_FRED_data(url, variable)
UE_df.head(n=5)

Unnamed: 0,State Name,Unemployment Rate [Percent],Year
0,Alabama,2.5,2023
1,Alaska,4.2,2023
2,Arizona,3.9,2023
3,Arkansas,3.3,2023
4,California,4.8,2023


In [41]:
# Resident population by state (FRED release table: rid=118, eid=259194).
url = 'https://fred.stlouisfed.org/release/tables?eid=259194&rid=118'
variable = 'Population'

# extract_FRED_data is not defined in this notebook; presumably it arrives via
# `from myfuncs import *` -- confirm there.
pop_df = extract_FRED_data(url, variable)
pop_df.head(n=5)

Unnamed: 0,State Name,Population [Thousands of Persons],Year
0,Alabama,5108.468,2023
1,Alaska,733.406,2023
2,Arizona,7431.344,2023
3,Arkansas,3067.732,2023
4,California,38965.193,2023


In [45]:
# Combine the three FRED tables (income_df, UE_df, pop_df) into one frame,
# keeping only the (State Name, Year) pairs that appear in all of them.
# NOTE(review): the income_df preview displayed earlier in this notebook has
# per-year income columns and no 'Year' column, which would not match these
# merge keys -- confirm what extract_FRED_data currently returns.
fred_df = (
    income_df
    .merge(UE_df, on=['State Name', 'Year'], how='inner')
    .merge(pop_df, on=['State Name', 'Year'], how='inner')
)
# Optional export of the combined table (left disabled):
# fred_df.to_csv('assets/FRED_data.csv')
fred_df

Unnamed: 0,State Name,Median Household Income [Current Dollars],Year,Unemployment Rate [Percent],Population [Thousands of Persons]
0,Alabama,60660.0,2023,2.5,5108.468
1,Alaska,98190.0,2023,4.2,733.406
2,Arizona,82660.0,2023,3.9,7431.344
3,Arkansas,63250.0,2023,3.3,3067.732
4,California,89870.0,2023,4.8,38965.193
...,...,...,...,...,...
97,Virginia,85170.0,2022,2.8,8679.099
98,Washington,89430.0,2022,4.1,7784.477
99,West Virginia,52460.0,2022,3.9,1774.035
100,Wisconsin,73330.0,2022,2.9,5890.543


### US Census Bureau: American Community Survey (ACS)

In [60]:
# Load variables from a local .env file (if one exists) so the Census Bureau
# API key is available on a fresh kernel. With its default settings,
# load_dotenv() does not override variables already set in the environment.
load_dotenv()
api_key_USCB = os.getenv('API_KEY_USCB')

# Pull ACS data for 2023 and 2022, the same two years shown in the FRED table.
# state_code=None presumably requests every state -- confirm in myfuncs.
acs_2023_df = extract_and_preprocess_ACS_data(api_key=api_key_USCB, year=2023, state_code=None)
acs_2022_df = extract_and_preprocess_ACS_data(api_key=api_key_USCB, year=2022, state_code=None)

In [64]:
# Stack the 2023 and 2022 ACS extracts into one frame with a fresh 0..n-1 index.
# The previous version concatenated acs_2023_df with itself, so the 2023 rows
# were duplicated and the 2022 rows were never included.
acs_df = pd.concat([acs_2023_df, acs_2022_df], ignore_index=True)
acs_df.head()

Unnamed: 0,Median Household Income,Per Capita Income,Gini Index of Income Inequality,Total Population,Median Age,Median Home Value,State Code (FIPS),Year,Unemployment Rate,Percent Foreigners
0,62212.0,35046.0,0.4771,5108468.0,39.6,216600.0,1,2023,0.018804,0.039766
1,86631.0,45792.0,0.4492,733406.0,36.5,347500.0,2,2023,0.023868,0.074408
2,77315.0,41290.0,0.465,7431344.0,39.3,411200.0,4,2023,0.020882,0.131999
3,58700.0,33012.0,0.474,3067732.0,38.9,195700.0,5,2023,0.019464,0.052919
4,95521.0,48013.0,0.487,38965193.0,38.2,725800.0,6,2023,0.028509,0.273065
