# Importing Libraries and Cleaned Datasets

In [65]:
#import kagglehub
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels as sm

county_health_rank = pd.read_csv('county_rankings25.csv')
fips_codes = pd.read_csv('all_fips.csv')
county_vars = pd.read_csv('county_vars_2.csv')
ltss_2022 = pd.read_csv('ltss_2022.csv')

# Data Cleaning Plan
## County Health Rankings 2025
In place of the previousl 4 tables reporting socio economic metrics, I found a new Dataset 'County Health Rankings.' The data wrangling process will involve selecting the target variables, renaming, normalizing data types, validating the resulting dataframe, and applying the standardized hierarchical index (county, state).

Since dataset contains hundreds of variables, I created a small dataframe of our variables of interest and normalized names to facilate creating the subset itself and its naming conventions. 


## LTSS Enrollment and Expenditure tables
The state-level LTSS enrollment and expenditure tables remain and their cleaning involves: subsetting by columns neeeded reporting metrics for enrollment and expenses. These variables reflect enrollment in the first table and expenses in the second for (1)The entire LTSS program, (2) Institutional care, and (3)HCBS program. 

I will subset each table by thsese colums, rename by the conventions defined at the beginning of the section, remove invalid characters ('$', ',') from numeric columns, change the datatypes of the columns to floats for quantitative variables ans strings for 'id' variables, and set 'state' as index prior to merging.

## FIPS & States ID table
I will merge both tables above to a previosly created 'all_fips' (which serves as a sort of  'fact table'-listing county names, state names, state abbreviations, and corresponding fips codes) by index. 

I will validate the data and clean any merging errors like the creation of duplicate columns and/or changes to naming conventions (such as adding suffixes/prefixes to duplicate and original colums).

Finally, I will merge both of these new tables by the standardized index mentioned above.

# Data Cleaning

## LTSS Table

### Cleaning invalid characters in columns to prep for datatype conversion to float

In [66]:
# defining columns to keep and subsetting dataframes
ltss_cols= ['State', 'LTSS (total)']
ltss_population = ltss_2022[ltss_cols]

ltss_population['LTSS (total)'] = ltss_population['LTSS (total)'].str.replace(',', '').astype(float)
ltss_pop_names = {'State': 'state', 'LTSS (total)': 'ltss_state_enrollment'}
ltss_population = ltss_population.rename(columns=ltss_pop_names)
ltss_population.set_index(['state'])
# normalize state names
ltss_state_rename_enr = {'National': 'United States','District of\nColumbia':'District of Columbia'}
ltss_population['state'] = ltss_population['state'].replace(ltss_state_rename_enr)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ltss_population['LTSS (total)'] = ltss_population['LTSS (total)'].str.replace(',', '').astype(float)


## County Rankings tables

create subset, clean result, and merge with 'fips_codes'

In [67]:
county_health_subset = county_vars['raw_variable'].tolist()
new_names = county_vars[' new_name'].tolist()
county_health_cols = dict(zip(county_health_subset, new_names))
# creating subset and renaiming with list and dictionary from above
county_health= county_health_rank[county_health_subset]
county_health= county_health.rename(columns=county_health_cols)
# normalizing fips codes
county_health['fips'] = county_health['fips'].astype('str')
fips_codes['fips'] = fips_codes['fips'].astype('str')
#merging with fips codes
us_counties = pd.merge(county_health, fips_codes, how='outer', on='fips')
# creating new dataframe without non-county data to keep original dataframe for 'non_county' data subset
us_counties_updated = us_counties.dropna(subset=['state'])
# these are states, the US, and 'planning' regions
non_counties = us_counties[us_counties['state'].isnull()]

### clean new table

In [68]:
def clean_merge(us_counties_updated):
    # Drop columns: 'county_x', 'state_abbr_x'
    us_counties_updated = us_counties_updated.drop(columns=['county_x', 'state_abbr_x'])
    # Rename column 'county_y' to 'county'
    us_counties_updated = us_counties_updated.rename(columns={'county_y': 'county'})
    # Rename column 'state_abbr_y' to 'state_abbr'
    us_counties_updated = us_counties_updated.rename(columns={'state_abbr_y': 'state_abbr'})
    return us_counties_updated

In [69]:
us_counties_updated = clean_merge(us_counties_updated)
us_counties_updated = us_counties_updated.round(2)

## Drill down subsets

In [70]:
# creating subset
child_poverty_groups = pd.read_csv('child_poverty.csv')
child_poverty_subset = child_poverty_groups['raw_variable'].tolist()
child_poverty_names = child_poverty_groups[' new_name'].tolist()
child_poverty_cols = dict(zip(child_poverty_subset, child_poverty_names))
child_poverty= county_health_rank[child_poverty_subset]

# cleaning new df:renaming with list and dictionary from above, changing dytypes, and rounding
child_poverty= child_poverty.rename(columns=child_poverty_cols)
child_poverty['fips'] = child_poverty['fips'].astype('str')
child_poverty = child_poverty.round(2)
child_poverty =child_poverty.fillna(0)

## Multi-Index Tables for drill down

In [71]:
#index = ['state_abbr', 'county', 'fips']
cp_test = child_poverty
#child_poverty = child_poverty.drop(columns='population')
#child_poverty['agg_value'] = 'child_poverty'
cp_test['childpoverty_AIAN'] = cp_test['AIAN']*cp_test['population']
cp_test['childpoverty_Black'] = cp_test['Black']*cp_test['population']
cp_test['childpoverty_Hispanic'] = cp_test['Hispanic']*cp_test['population']
cp_test['childpoverty_White'] = cp_test['White']*cp_test['population']

cp_test = cp_test.drop(columns=['population'])
cp_test['childpoverty_NHOPI'] = 0
cp_test['childpoverty_Asian'] = 0
cp_test['childpoverty_Other'] = 0

In [72]:
# values are percentages
cp_test.head()

Unnamed: 0,fips,county,state_abbr,AIAN,Black,Hispanic,White,childpoverty_AIAN,childpoverty_Black,childpoverty_Hispanic,childpoverty_White,childpoverty_NHOPI,childpoverty_Asian,childpoverty_Other
0,0,United States,US,0.28,0.3,0.22,0.1,93776170.6,100474500.0,73681276.9,33491489.5,0,0,0
1,1000,Alabama,AL,0.28,0.38,0.33,0.12,1430371.04,1941218.0,1685794.44,613016.16,0,0,0
2,1001,Autauga County,AL,0.0,0.26,0.22,0.08,0.0,15688.92,13275.24,4827.36,0,0,0
3,1003,Baldwin County,AL,0.0,0.54,0.34,0.07,0.0,136893.8,86192.38,17745.49,0,0,0
4,1005,Barbour County,AL,0.0,0.49,0.6,0.04,0.0,12046.65,14751.0,983.4,0,0,0


### Reshape df

In [74]:
# Step 1: Melt the dataframe to long format
cp_long = cp_test.melt(
    id_vars=['state_abbr', 'county', 'fips'], 
    var_name='metric_ethnicity', 
    value_name='value'
)

# Step 2: Split the combined 'metric_ethnicity' column
cp_long[['metric', 'ethnicity']] = cp_long['metric_ethnicity'].str.extract(r'(.+?)_(.+)')

# Step 3: Drop the original combined column
cp_long = cp_long.drop(columns='metric_ethnicity')

# Step 4: Pivot to have metrics as columns
cp_tidy = cp_long.pivot_table(
    index=['state_abbr', 'county', 'fips', 'ethnicity'],
    columns='metric',
    values='value'
).reset_index()

# Step 5: Optional - Set a MultiIndex for hierarchical indexing
cp_tidy.set_index(['state_abbr', 'county', 'fips', 'ethnicity'], inplace=True)

# Step 6: (Optional) Clean up column names if needed
cp_tidy.columns.name = None


In [75]:
cp_tidy.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,childpoverty
state_abbr,county,fips,ethnicity,Unnamed: 4_level_1
AK,Alaska,2000,AIAN,176017.44
AK,Alaska,2000,Asian,0.0
AK,Alaska,2000,Black,249358.04
AK,Alaska,2000,Hispanic,88008.72
AK,Alaska,2000,NHOPI,0.0


In [76]:
cp_tidy.stack()

state_abbr  county   fips   ethnicity              
AK          Alaska   2000   AIAN       childpoverty    176017.44
                            Asian      childpoverty         0.00
                            Black      childpoverty    249358.04
                            Hispanic   childpoverty     88008.72
                            NHOPI      childpoverty         0.00
                                                         ...    
WY          Wyoming  56000  Black      childpoverty    245303.94
                            Hispanic   childpoverty     99289.69
                            NHOPI      childpoverty         0.00
                            Other      childpoverty         0.00
                            White      childpoverty     64246.27
Length: 22428, dtype: float64

# Insights

## Correlation Matrix and Basic Descriptive Stats

In [None]:
matrix = us_counties_updated.corr(method='spearman', numeric_only=True).round(2)
sns.heatmap(matrix)

In [None]:
us_counties_updated.describe()

In [None]:
us_counties_updated['population '] = us_counties_updated['population '].rename('population')

## Probability distributions

In [None]:
fig, axes = plt.subplots(3, 3, figsize=(8, 6), sharey=True)

axes[0, 0].hist(us_counties_updated['income_inequality'], bins=30, color='skyblue', alpha=0.7)
axes[0, 1].hist(us_counties_updated['median_income'], bins=30, color='lightgreen', alpha=0.7)
axes[0, 2].hist(us_counties_updated['child_care_cost_burden'], bins=30, color='salmon', alpha=0.7)
axes[1, 0].hist(us_counties_updated['unemployed_percentage'], bins=30, color='lightcoral', alpha=0.7)
axes[1, 1].hist(us_counties_updated['premature_death'], bins=30, color='gold', alpha=0.7)
axes[1, 2].hist(us_counties_updated['child_poverty'], bins=30, color='lightblue', alpha=0.7)
axes[2, 0].hist(us_counties_updated['child_mortality'], bins=30, color='lightpink', alpha=0.7)
axes[2, 1].hist(us_counties_updated['school_funding_gap'], bins=30, color='lightyellow', alpha=0.7)
axes[2, 2].hist(us_counties_updated['gender_pay_gap'], bins=30, color='lightgray', alpha=0.7)

axes[0, 0].set_title('Income Inequality')
axes[0, 1].set_title('Median Income')
axes[0, 2].set_title('Child Care Cost Burden')
axes[1, 0].set_title('Unemployed Percentage')
axes[1, 1].set_title('Premature Death')
axes[1, 2].set_title('Child Poverty')
axes[2, 0].set_title('Child Mortality')
axes[2, 1].set_title('School Funding Gap')
axes[2, 2].set_title('gender Pay Gap')
plt.tight_layout()
plt.show()

### Comparing distributions between income inequality groupings: 'above_median' and 'below_median'

In [None]:
print(us_counties_updated['income_inequality'].median())

In [None]:
us_counties_updated['income_inequality_cat'] = np.where(us_counties_updated['income_inequality'] > 4.4, 'above_median', 'below_median')
us_counties_updated['income_inequality_cat'] = us_counties_updated['income_inequality_cat'].astype('category')