In [40]:
# imports

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, adjusted_rand_score
import plotly.express as px




In [4]:
# read the CSV file into a DataFrame and preview its first five rows
data = pd.read_csv(filepath_or_buffer='data.csv')
data.head(n=5)

Unnamed: 0,statename,year,month,spend apparel and accessories,spend accommodation and food services,"spend arts, entertainment, and recreation",spend all,spend general merchandise + apparel,spend durable goods,spend general merchandise,...,day,time retail and recreation,time grocery and pharmacy,time parks,time transit stations,time workplaces,time residential,time away from home,spend recovered,emp recovered
0,Alabama,2020,1,0.0,-1.91e-09,0.0,0.0,-2.88e-08,3.28e-08,0.0,...,,,,,,,,,True,False
1,Alabama,2020,2,-0.0438,0.0216,0.0643,0.0278,0.0202,0.0125,0.0782,...,29.0,0.09,0.0414,0.191,0.107,0.01,-0.00571,0.00855,True,True
2,Alabama,2020,3,-0.351,-0.38,-0.373,-0.0898,-0.146,-0.0385,0.0646,...,31.0,-0.334,-0.0643,0.18,-0.234,-0.344,0.131,-0.159,False,False
3,Alabama,2020,4,-0.535,-0.606,-0.652,-0.229,-0.319,0.0153,-0.0916,...,30.0,-0.24,-0.03,0.143,-0.23,-0.347,0.121,-0.148,False,False
4,Alabama,2020,5,-0.182,-0.382,-0.472,-0.101,-0.0727,0.172,0.0279,...,31.0,-0.0914,0.0314,0.373,-0.0371,-0.236,0.0829,-0.0943,False,False


In [5]:
# show the column labels of the loaded frame
data.columns

Index(['statename', 'year', 'month', 'spend apparel and accessories',
       'spend accommodation and food services',
       'spend arts, entertainment, and recreation', 'spend all',
       'spend general merchandise + apparel', 'spend durable goods',
       'spend general merchandise', 'spend grocery and food stores',
       'spend health care and social assistance',
       'spend home improvement centers', 'spend other in-person services',
       'spend non-durable goods', 'spend remote services',
       'spend transportation and warehousing', 'spend all_incmiddle',
       'spend all q1', 'spend all q2', 'spend all q3', 'spend all q4',
       'spend in-person services', 'spend retail excluding grocery',
       'spend retail including grocery', 'day_endofweek', 'emp all', 'emp q1',
       'emp q2', 'emp q3', 'emp q4', 'emp inc middle', 'emp inc below_median',
       'emp inc above median', 'emp trade_transport utilities',
       'emp professional business_services', 'emp education hea

### 1. data prep

In [46]:
# keep only the rows for month 3 (March) in years after 2020
# NOTE(review): the frame is still called df_jan although the filter selects
# month 3, not month 1 — confirm which month is intended
df_jan = data[data['month'].eq(3) & data['year'].gt(2020)]

# columns handed to the clustering step
feats = [
    'spend recovered',
    'vaccine_rate',
    'case_rate',
    'death_rate',
    'hospitalized_rate',
]

# drop every row that has a missing value in any of the selected columns
X = df_jan.loc[:, feats].dropna()

# recovery flags for the rows that survived dropna, kept for later comparison
# NOTE(review): 'spend recovered' is also one of the clustering inputs above,
# so comparing clusters against it is partly circular — confirm this is intended
labels_spend = df_jan['spend recovered'].loc[X.index]
labels_emp = df_jan['emp recovered'].loc[X.index]

### 2. standardize features

In [47]:
# learn per-column mean and standard deviation from X, then rescale X with them
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

### 3. fit k-means

In [54]:
# number of clusters
k = 3

# fit kmeans
# n_init is set explicitly: scikit-learn's default changed from 10 to 'auto'
# in version 1.4, so pinning it keeps the clustering stable across versions.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=8)

# Restrict the frame to the rows that were actually clustered (those that
# survived dropna). The explicit .copy() makes the column assignment below
# write to an independent frame and avoids SettingWithCopyWarning.
df_jan = df_jan.loc[X.index].copy()
df_jan['cluster'] = kmeans.fit_predict(X_scaled)

# map the cluster centers back from standardized units to the original feature
# units so they can be read directly
centroids = pd.DataFrame(scaler.inverse_transform(kmeans.cluster_centers_), columns=feats)
centroids['cluster'] = range(k)
centroids



Unnamed: 0,spend recovered,vaccine_rate,case_rate,death_rate,hospitalized_rate,cluster
0,0.944444,83.98405,24598.068996,212.797401,6.938056,0
1,0.980392,22.702151,8822.852625,149.912587,11.272612,1
2,1.0,74.391544,29655.764418,358.851417,8.057732,2


In [55]:
# full state name -> two-letter postal code, as required by plotly's
# 'USA-states' location mode
state_abbrev = {
    'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR', 'California': 'CA', 'Colorado': 'CO', 'Connecticut': 'CT',
    'Delaware': 'DE', 'Florida': 'FL', 'Georgia': 'GA', 'Hawaii': 'HI', 'Idaho': 'ID', 'Illinois': 'IL', 'Indiana': 'IN',
    'Iowa': 'IA', 'Kansas': 'KS', 'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD', 'Massachusetts': 'MA',
    'Michigan': 'MI', 'Minnesota': 'MN', 'Mississippi': 'MS', 'Missouri': 'MO', 'Montana': 'MT', 'Nebraska': 'NE', 'Nevada': 'NV',
    'New Hampshire': 'NH', 'New Jersey': 'NJ', 'New Mexico': 'NM', 'New York': 'NY', 'North Carolina': 'NC', 'North Dakota': 'ND',
    'Ohio': 'OH', 'Oklahoma': 'OK', 'Oregon': 'OR', 'Pennsylvania': 'PA', 'Rhode Island': 'RI', 'South Carolina': 'SC', 'South Dakota': 'SD',
    'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT', 'Vermont': 'VT', 'Virginia': 'VA', 'Washington': 'WA', 'West Virginia': 'WV',
    'Wisconsin': 'WI', 'Wyoming': 'WY', 'District of Columbia': 'DC'
}

# names missing from the mapping become NaN
df_jan['state_abbrev'] = df_jan['statename'].map(state_abbrev)

# Cluster ids are nominal labels, not quantities. Passing the integer column
# makes plotly draw a continuous colorbar, so plot a string version instead to
# get one discrete color and legend entry per cluster.
cluster_order = [str(c) for c in sorted(df_jan['cluster'].unique())]
map_df = df_jan.assign(cluster_label=df_jan['cluster'].astype(str))

# NOTE(review): the upstream filter keeps every year after 2020, so a state may
# appear in several rows (possibly in different clusters) and be drawn more than
# once — confirm which year the map is meant to show.
fig = px.choropleth(
    map_df,
    locations='state_abbrev',
    locationmode='USA-states',
    color='cluster_label',
    scope='usa',
    title='State Clusters by Economic Recovery Patterns',
    category_orders={'cluster_label': cluster_order},
    labels={'cluster_label': 'cluster'}
)
fig.show()
