In [4]:
import os
import warnings
from pathlib import Path, PureWindowsPath

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler

# Load files in

In [5]:
# Root of the raw-content repository that hosts every input file.
url = 'https://raw.githubusercontent.com/nollijish/datasets/main/'

# File names of the three compressed CSV inputs.
filename_1, filename_2, filename_3 = (
    'FEMA_claims.csv.gz',
    'ASEC_income.csv.gz',
    'ZHVI.csv.gz',
)

# Full location of each file: repository root followed by the file name.
filepath_1, filepath_2, filepath_3 = (
    url + name for name in (filename_1, filename_2, filename_3)
)

In [6]:
# Read the three CSV inputs with identical parser settings: malformed rows are
# reported as warnings instead of raising (on_bad_lines='warn'), and
# low_memory=False turns off pandas' chunked, lower-memory parsing mode.
df_fema, df_asec, df_zhvi = (
    pd.read_csv(source, on_bad_lines='warn', low_memory=False)
    for source in (filepath_1, filepath_2, filepath_3)
)

# Aggregate FEMA data by state, county, and year for year-over-year (y-o-y) analysis

In [None]:
# Column names identifying one state / county / year combination.
# NOTE(review): the aggregation further down spells this same list out
# literally instead of referencing `key`.
key = ['state','county','year']

In [None]:
# Show column names, dtypes and non-null counts of the FEMA frame as loaded.
df_fema.info()

In [None]:
# Peek at three randomly chosen rows (no seed is set, so the rows differ per run).
df_fema.sample(3)

In [None]:
# Derive an integer 'year' column from the 'date' strings (parsed strictly as
# YYYY-MM-DD). The aggregation that follows groups on 'year', so a failed
# conversion has to stop the notebook here rather than be swallowed and
# resurface later as a confusing KeyError.
try:
    df_fema['year'] = pd.to_datetime(
        arg=df_fema.loc[:, 'date'],
        errors='raise',
        format="%Y-%m-%d",
    ).dt.year
except Exception as exc:
    # `Exception` instead of a bare `except:` so KeyboardInterrupt / SystemExit
    # still propagate; report the underlying cause, then re-raise it.
    print(f"that ain't it man: could not derive 'year' from 'date' ({exc!r})")
    raise

In [None]:
# Re-inspect columns and dtypes of the FEMA frame at this point in the notebook.
df_fema.info()

In [None]:
# One row per (state, county, year): the summed reimbursements and the number
# of non-null reimbursement entries. Rows containing any NaN are dropped.
df2_fema = (
    df_fema
    .groupby(['state', 'county', 'year'], as_index=False)
    .agg(
        tot_reim=pd.NamedAgg(column='reimbursements', aggfunc='sum'),
        cnt_req=pd.NamedAgg(column='reimbursements', aggfunc='count'),
    )
    .dropna()
)

In [None]:
# Peek at three randomly chosen rows of the per-year aggregate (unseeded).
df2_fema.sample(3)

In [None]:
# Show column names, dtypes and non-null counts of the per-year aggregate.
df2_fema.info()

# Fit a model for each year to cluster into: low, med, high, very high

In [None]:
def hmmm(i, year, df, ax, n_clusters=4, random_state=None):
    """Cluster one year of aggregated claims with KMeans and plot the result.

    The rows of ``df`` whose ``year`` column equals ``year`` are selected, the
    ``tot_reim`` and ``cnt_req`` columns are min-max scaled, and KMeans is fit
    on the scaled values. Each cluster is then ranked by the Euclidean norm of
    its centre in scaled space (rank 1 = smallest norm) and that rank is
    attached to every selected row as ``risk_rnk``.

    Parameters
    ----------
    i : int
        Position of the year in the caller's loop. Unused; kept so existing
        calls keep working.
    year : int
        Value of the ``year`` column to select. The filter and the plot title
        both use this argument (previously they read a global named ``y``).
    df : pandas.DataFrame
        Must contain ``year``, ``tot_reim`` and ``cnt_req`` columns.
    ax : matplotlib Axes
        Axes that receives the scatter plot; it is modified in place.
    n_clusters : int, default 4
        Number of KMeans clusters, and therefore of risk ranks.
    random_state : int or None, default None
        Forwarded to ``KMeans``; pass an int for reproducible cluster labels.

    Returns
    -------
    tuple
        ``(frame, ax)`` where ``frame`` holds the selected rows with a new
        ``risk_rnk`` column (and a fresh index produced by the merge), and
        ``ax`` is the axes object that was passed in.
    """
    features = ['tot_reim', 'cnt_req']

    # Work on a copy so the caller's frame is never modified.
    year_rows = df.loc[df.loc[:, 'year'] == year, :].copy()

    # Scale both features to [0, 1] so neither dominates the distance metric.
    X = pd.DataFrame(
        MinMaxScaler().fit_transform(year_rows.loc[:, features]),
        columns=features,
    )
    mdl = KMeans(
        n_clusters=n_clusters,
        n_init='auto',
        random_state=random_state,
    ).fit(X)

    year_rows['cluster'] = mdl.labels_
    year_rows = year_rows.astype({'cluster': 'int8'})

    # KMeans labels are arbitrary; order the clusters by how far their centre
    # lies from the origin of the scaled feature space instead.
    centers = pd.DataFrame(mdl.cluster_centers_, columns=features)
    centers['cluster'] = np.arange(n_clusters)
    centers['cluster_l2'] = (
        centers['tot_reim'] ** 2 + centers['cnt_req'] ** 2
    ) ** 0.5
    centers['risk_rnk'] = centers['cluster_l2'].rank().astype('int8')

    # Translate each row's raw cluster label into its rank.
    year_rows = pd.merge(
        year_rows,
        centers.loc[:, ['cluster', 'risk_rnk']],
        how='left',
        on='cluster',
    )

    sns.set_style('whitegrid')
    sns.scatterplot(
        data=year_rows,
        x='tot_reim',
        y='cnt_req',
        hue='risk_rnk',
        palette='Set2',
        ax=ax,
    )
    ax.set(
        ylabel='Reimbursement Requests',
        xlabel='Reimbursement Costs (USD)',
        xscale='log',
        yscale='log',
        title='Year: {}'.format(year),
    )

    return year_rows.drop(columns=['cluster']), ax

In [None]:
# Cluster every year, draw one panel per year on a k-column grid, collect the
# ranked rows into df3_fema, and save the combined figure under ./figs.
years = range(2005, 2023)
k = 3                      # panels per row
h = -(-len(years) // k)    # ceiling division: enough rows to hold every year

fig, ax = plt.subplots(h, k, figsize=(15, h*3.5), squeeze=False)

yearly_frames = []

# Suppress warnings only while fitting, plotting and saving, instead of
# switching them off for the rest of the session.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')

    # NOTE(review): the loop variable is deliberately still called `y` --
    # renaming it is only safe once nothing reads `y` from the notebook's
    # global namespace.
    for i, y in enumerate(years):
        panel = ax[i // k, i % k]
        yearly_frames.append(hmmm(i, y, df2_fema, panel)[0])

    # Concatenate once rather than re-copying the growing frame every iteration.
    df3_fema = pd.concat(yearly_frames)

    fig.subplots_adjust(hspace=0.4,wspace=0.2)
    fig.suptitle('State,County FEMA Flood Risk Clustering (4=very high,3=high,2=med,1=low)',y=0.915)

    path = os.path.join(os.getcwd(),'figs')
    filename = 'y-o-y_cluster_all' + '.png'

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(path, exist_ok=True)
    # Save this figure explicitly rather than whatever pyplot considers current.
    fig.savefig(fname=os.path.join(path,filename))

plt.close(fig)

In [None]:
# Boolean mask of the rows whose cluster rank equals 4, followed by three of
# those rows picked at random.
filt_ugh = df3_fema['risk_rnk'].eq(4)
df3_fema.loc[filt_ugh].sample(n=3)

In [None]:
# Total number of cells (rows x columns) in the per-year aggregate.
# NOTE(review): .size counts cells, not rows; use len(df2_fema) or .shape when
# the goal is to compare row counts between frames with different columns.
df2_fema.size

In [None]:
# Total number of cells (rows x columns) in the clustered frame.
# NOTE(review): .size counts cells, not rows; use len(df3_fema) or .shape when
# the goal is to compare row counts between frames with different columns.
df3_fema.size

# Assess state- and county-level risk

In [None]:
# One row per (state, county): the mean cluster rank across years and the
# total number of reimbursement requests.
dfgb_risk_mu = (
    df3_fema
    .groupby(['state', 'county'], as_index=False)
    .agg(
        mean_risk=pd.NamedAgg(column='risk_rnk', aggfunc='mean'),
        cnt_req_tot=pd.NamedAgg(column='cnt_req', aggfunc='sum'),
    )
)

In [None]:
# The five state/county rows with the largest mean rank: written to CSV, then
# displayed as the cell output.
top5_by_mean_risk = dfgb_risk_mu.nlargest(n=5, columns='mean_risk')
top5_by_mean_risk.to_csv('highest_avg_risk.csv', index=False)
top5_by_mean_risk

In [None]:
# The five state/county rows with the most reimbursement requests overall:
# written to CSV, then displayed as the cell output.
top5_by_request_count = dfgb_risk_mu.nlargest(n=5, columns='cnt_req_tot')
top5_by_request_count.to_csv('highest_num_reimbursements.csv', index=False)
top5_by_request_count