In [1]:
import pandas as pd
import numpy as np
from functools import reduce
from sklearn.preprocessing import OneHotEncoder

In [2]:
# read in datasets 
nsduh = pd.read_csv("./Data/nsduh_data_cleaned.csv")
hpsa = pd.read_csv('./Data/HPSA_Cleaned.csv')
cbsa = pd.read_csv("./Data/grants_per_county_cbsa.csv")

### Group HPSA and Grant Data

In [3]:
# 25th Percentile
def q25(x):
    return x.quantile(0.25)

# 75th Percentile
def q75(x):
    return x.quantile(0.75)

In [4]:
# drop unrecognized cbsas
cbsa = cbsa[~cbsa['metropolitanmicropolitanstatis'].isnull()]

# cbsa - flatten by year and pden10 mapper
cbsa_flat_df = cbsa.groupby(['Award Year','PDEN10'])\
                .agg({'Total Active Grant Financial Assistance':['median','mean',q25, q75],
                     'Mental Health Assistance':['mean']}) # mental health median, q25, q75 are 0

# flatten hierarchical index 
cbsa_flat_df.columns = [' '.join(col).strip() for col in cbsa_flat_df.columns.values]

cbsa_flat_df = cbsa_flat_df.reset_index()

In [5]:
# drop unrecognized hpsas
hpsa = hpsa[~hpsa['metropolitanmicropolitanstatis'].isnull()]

In [6]:
hpsa['Withdrawn Year'] = pd.to_datetime(hpsa['Withdrawn Date']).dt.year

In [7]:
hpsa['HPSA Designation Year'] = pd.to_datetime(hpsa['HPSA Designation Date']).dt.year

In [8]:
# clean hpsa data
cols_to_keep_cat = ['HPSA Component Type Description','HPSA Designation Population Type Description',
                            'U.S. - Mexico Border 100 Kilometer Indicator','Designation Type']
cols_to_keep_num = ['Withdrawn Year','HPSA Designation Year','PDEN10','HPSA Score','HPSA Degree of Shortage','HPSA FTE','HPSA Shortage','DaysBeforeWithdrawn']

In [9]:
enc = OneHotEncoder(drop='first')
enc.fit(hpsa[cols_to_keep_cat])
hpsa_cat_df = pd.DataFrame(enc.transform(hpsa[cols_to_keep_cat]).toarray(), columns=enc.get_feature_names(cols_to_keep_cat))


In [10]:
hpsa_cat_df = hpsa_cat_df.drop(columns=['HPSA Component Type Description_Unknown'])

In [11]:
hpsa_full_df = pd.concat([hpsa_cat_df,hpsa[cols_to_keep_num]],axis=1)

In [12]:
# define active hpsas by year 
hpsa_2015 = hpsa_full_df[(hpsa_full_df['Withdrawn Year'] >= 2015) & (hpsa_full_df['HPSA Designation Year'] <= 2015)]
hpsa_2016 = hpsa_full_df[(hpsa_full_df['Withdrawn Year'] >= 2016) & (hpsa_full_df['HPSA Designation Year'] <= 2016)]
hpsa_2017 = hpsa_full_df[(hpsa_full_df['Withdrawn Year'] >= 2017) & (hpsa_full_df['HPSA Designation Year'] <= 2017)]
hpsa_2018 = hpsa_full_df[(hpsa_full_df['Withdrawn Year'] >= 2018) & (hpsa_full_df['HPSA Designation Year'] <= 2018)]
hpsa_2019 = hpsa_full_df[(hpsa_full_df['Withdrawn Year'] >= 2019) & (hpsa_full_df['HPSA Designation Year'] <= 2019)]

In [13]:
year_dfs = [hpsa_2015, hpsa_2016, hpsa_2017, hpsa_2018, hpsa_2019]
year = [2015,2016,2017,2018,2019]
cat_cols = list(hpsa_cat_df.columns)
cat_cols.append('Year')
cat_cols.append('PDEN10')
num_cols = list(cols_to_keep_num)
num_cols.append('Year')

new_year_dfs = []
for df, year in zip(year_dfs,year):
    
    df['DaysBeforeWithdrawn'] = np.where(df['Withdrawn Year'] == year, df['DaysBeforeWithdrawn'], np.nan)
    df['DaysActive_Snapshot'] = year - df['HPSA Designation Year']
    df['Year'] = year 
    
    df_cat = df[cat_cols].groupby(['Year','PDEN10']).sum()
    df_num = df[num_cols].groupby(['Year','PDEN10']).agg(['mean','median',q25, q75])

    df = pd.concat([df_cat.reset_index(), df_num.reset_index()],axis=1)
    df.head()
    new_year_dfs.append(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['DaysBeforeWithdrawn'] = np.where(df['Withdrawn Year'] == year, df['DaysBeforeWithdrawn'], np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['DaysActive_Snapshot'] = year - df['HPSA Designation Year']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Year'] = year
A value is trying to b

In [14]:
final_hpsa_df = pd.concat(new_year_dfs,axis=0)

### Merge Grant, HPSA, and NSDUH Data

In [15]:
nsduh = nsduh.rename(columns={'Population_Density_2010':'PDEN10'})
cbsa_flat_df = cbsa_flat_df.rename(columns={'Award Year':'Year'})

In [16]:
# join hpsa and cbsa 
df_final = reduce(lambda left,right: pd.merge(left,right,on=['Year','PDEN10'],how='left'), [nsduh,final_hpsa_df,cbsa_flat_df])


In [17]:
df_final = df_final.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])

In [21]:
df_final[df_final['Total Active Grant Financial Assistance q25'].isnull()][['Year','PDEN10']]

Unnamed: 0,Year,PDEN10
2,2015,3
17,2015,3
70,2015,3
82,2015,3
102,2015,3
...,...,...
214458,2019,3
214462,2019,3
214476,2019,3
214496,2019,3


### Save the fully merged df

In [18]:
df_final.to_csv("./Data/NSDUH_HPSA_GRANT_finaldata.csv",index=False)