# Importing Libraries and Cleaned Datasets

In [99]:
#import kagglehub
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels as sm

# Raw County Health Rankings export (the wide table that the subsets below select from).
county_health_rank = pd.read_csv("county_rankings25.csv")
# FIPS lookup table used to attach county/state identifiers.
fips_codes = pd.read_csv("all_fips.csv")
# Variable dictionary: raw column names of interest and their normalized names.
county_vars = pd.read_csv("county_vars_2.csv")
# State-level LTSS table (2022).
ltss_2022 = pd.read_csv("ltss_2022.csv")
# Variable dictionary for the metrics broken down by race.
race_groups = pd.read_csv("agg_race_variables.csv")

# Data Cleaning Plan
## County Health Rankings 2025
In place of the previous 4 tables reporting socioeconomic metrics, I found a new dataset, 'County Health Rankings.' The data wrangling process will involve selecting the target variables, renaming them, normalizing data types, validating the resulting dataframe, and applying the standardized hierarchical index (county, state).

Since the dataset contains hundreds of variables, I created a small dataframe of our variables of interest and their normalized names to facilitate creating the subset itself and applying its naming conventions.

Note: much of the numeric data is in percentages; separate subsets will include totals and percentages and will be used depending on the type of analysis and indexing needed.


## LTSS Enrollment and Expenditure tables
The state-level LTSS enrollment and expenditure tables remain, and their cleaning involves subsetting by the columns needed for reporting metrics on enrollment and expenses. These variables reflect enrollment in the first table and expenses in the second for (1) the entire LTSS program, (2) institutional care, and (3) the HCBS program.

I will subset each table by these columns, rename them by the conventions defined at the beginning of the section, remove invalid characters ('$', ',') from numeric columns, change the datatypes of the columns to floats for quantitative variables and strings for 'id' variables, and set 'state' as index prior to merging.

## FIPS & States ID table
I will merge both tables above to a previously created 'all_fips' table (which serves as a sort of 'fact table', listing county names, state names, state abbreviations, and corresponding FIPS codes) by index.

I will validate the data and clean any merging errors, like the creation of duplicate columns and/or changes to naming conventions (such as suffixes/prefixes added to duplicate and original columns).

Finally, I will merge both of these new tables by the standardized index mentioned above.

# Data Cleaning

## LTSS Table

### Cleaning invalid characters in columns to prep for datatype conversion to float

In [100]:
# Columns to keep from the raw LTSS table and the normalized names they receive.
ltss_cols = ['State', 'LTSS (total)']
ltss_pop_names = {'State': 'state', 'LTSS (total)': 'ltss_state_enrollment'}
# Spellings in the raw table that are normalized to standard state names.
ltss_state_rename_enr = {'National': 'United States','District of\nColumbia':'District of Columbia'}

# .copy() makes the subset an independent frame; assigning a column to the bare
# slice is what raised SettingWithCopyWarning.
ltss_population = ltss_2022[ltss_cols].copy()

# Strip thousands separators so the enrollment counts can be cast to float.
ltss_population['LTSS (total)'] = (
    ltss_population['LTSS (total)'].str.replace(',', '', regex=False).astype(float)
)
ltss_population = ltss_population.rename(columns=ltss_pop_names)

# normalize state names
ltss_population['state'] = ltss_population['state'].replace(ltss_state_rename_enr)

# 'state' is intentionally left as a regular column: the earlier bare
# `ltss_population.set_index(['state'])` call discarded its result and was a no-op.
# Call `ltss_population.set_index('state')` at merge time if an index is needed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ltss_population['LTSS (total)'] = ltss_population['LTSS (total)'].str.replace(',', '').astype(float)


## County Rankings tables

create subset, clean result, and merge with 'fips_codes'

In [101]:
# Lookup lists from the variable dictionary: raw column names and their normalized names.
# NOTE(review): the ' new_name' label carries a leading space (race_groups uses
# 'new_name') — presumably it mirrors the CSV header; confirm.
county_health_subset = county_vars['raw_variable'].tolist()
new_names = county_vars[' new_name'].tolist()
county_health_cols = dict(zip(county_health_subset, new_names))

# Select the variables of interest, apply the normalized names, and cast the
# fips key to string, all in one chain.
county_health = (
    county_health_rank[county_health_subset]
    .rename(columns=county_health_cols)
    .assign(fips=lambda frame: frame['fips'].astype('str'))
)
# The lookup table's key gets the same cast so both merge keys share a dtype.
fips_codes['fips'] = fips_codes['fips'].astype('str')

# Outer join keeps rows that appear in only one of the two tables.
us_counties = county_health.merge(fips_codes, how='outer', on='fips')

# Rows with no 'state' value are kept separately (states, the US, and
# 'planning' regions); the original merged frame is left untouched.
missing_state = us_counties['state'].isnull()
non_counties = us_counties[missing_state]
us_counties_updated = us_counties.dropna(subset=['state'])

### clean new table

In [102]:
def clean_merge(us_counties_updated):
    """Resolve the suffixed duplicate columns left behind by a merge.

    Discards 'county_x' and 'state_abbr_x', then renames 'county_y' to
    'county' and 'state_abbr_y' to 'state_abbr'. The input frame is not
    modified; a new DataFrame is returned. ``drop`` raises KeyError if either
    '_x' column is absent.
    """
    suffixed_duplicates = ['county_x', 'state_abbr_x']
    canonical_names = {'county_y': 'county', 'state_abbr_y': 'state_abbr'}
    return (
        us_counties_updated
        .drop(columns=suffixed_duplicates)
        .rename(columns=canonical_names)
    )

#### apply function, create list of states and 'us' to remove from county columns, and round values

In [103]:
# Resolve the merge suffixes, then round every numeric column to two decimals.
us_counties_updated = clean_merge(us_counties_updated).round(2)

# Names that mark state- or nation-level rows rather than counties.
state_names = fips_codes['state'].unique().tolist() + ['United States']

# Keep only rows whose 'county' value is not one of those names.
is_aggregate_row = us_counties_updated['county'].isin(state_names)
us_counties_updated = us_counties_updated[~is_aggregate_row]

# subsets and groupings for drill-down analysis 

In [104]:
# Identifier columns shared by the county-level subsets below.
index = [
    'state_abbr',
    'county',
    'fips',
]

## Variable-based subsets

### Inequity Metrics

In [105]:
# Identifier columns plus the inequity metrics.
# NOTE(review): 'school_segration_index' is spelled as it appears elsewhere in this
# notebook; presumably it matches the normalized name in the variable dictionary — confirm.
inequity_vars = [
    'state_abbr', 'county', 'fips',
    'income_inequality', 'gender_pay_gap', 'school_segration_index',
    'residential_segregation', 'school_funding_gap',
]
# Explicit copy: the subset is later modified in place (index reset before the
# merge), so it must not share state with us_counties_updated.
inequity_df = us_counties_updated[inequity_vars].copy()

### Socio-economic indicators with racial breakdown

In [106]:
# Lookup lists from the race variable dictionary: raw column names and their normalized names.
racial_category_subset = race_groups['raw_variable'].tolist()
racial_category_names = race_groups['new_name'].tolist()
racial_category_cols = dict(zip(racial_category_subset, racial_category_names))

# Select the race-specific variables, apply the normalized names, cast fips to
# string, round to two decimals, and replace missing values with 0.
# NOTE(review): fillna(0) also turns missing metric values into zeros, which then
# feed the averages and histograms downstream — confirm this is intended.
race_categories = (
    county_health_rank[racial_category_subset]
    .rename(columns=racial_category_cols)
    .assign(fips=lambda frame: frame['fips'].astype('str'))
    .round(2)
    .fillna(0)
)

In [107]:
# Reshape the wide race table into one row per (county, race) with one column per metric.
id_columns = ['state_abbr', 'county', 'fips']

# Wide -> long: every non-identifier column becomes a ('metric_race', 'value') row.
rc_long = pd.melt(
    race_categories,
    id_vars=id_columns,
    var_name='metric_race',
    value_name='value',
)

# Split the combined 'metric_race' label at its first hyphen into 'metric' and
# 'race', then discard the combined column.
rc_long[['metric', 'race']] = rc_long['metric_race'].str.extract(r'(.+?)-(.+)')
rc_long = rc_long.drop(columns='metric_race')

# Long -> wide on 'metric'. pivot_table's default aggregation (the mean) applies
# if an (identifier, race, metric) combination occurs more than once.
race_metrics = pd.pivot_table(
    rc_long,
    index=id_columns + ['race'],
    columns='metric',
    values='value',
).reset_index()

# Remove the leftover 'metric' label from the columns axis.
race_metrics.columns.name = None


In [None]:
# Drop rows whose 'county' is a state/US name, replace remaining missing values
# with 0, and give the child poverty column its '_percentage' name.
race_metrics = (
    race_metrics.loc[~race_metrics['county'].isin(state_names)]
    .fillna(0)
    .rename(columns={'child_poverty': 'child_poverty_percentage'})
)

### County-Level socio-economic indicators (no racial breakdown)

In [110]:
# County-level socio-economic frame: everything in us_counties_updated except
# the columns listed here.
county_socio_econ = us_counties_updated.drop(
    columns=[
        # inequity metrics
        'income_inequality',
        'gender_pay_gap',
        'school_segration_index',
        'residential_segregation',
        'school_funding_gap',
        # remaining excluded indicators
        'child_mortality',
        'child_poverty_percentage',
        'firearm_fatalities',
        'life_expectancy',
        'median_income',
        'population',
        'premature_death',
    ]
)

## Total Values (vs percentages)

In [None]:
# County identifiers plus population.
pop_vars = ['state_abbr','county','fips', 'population']
county_pop = us_counties_updated[pop_vars]

# Normalized variable names that contain 'percentage'. The isinstance guard skips
# non-string entries (e.g. NaN from a blank dictionary row) instead of raising.
percent_str = 'percentage'
percent_vars = [name for name in new_names if isinstance(name, str) and percent_str in name]

# Independent copy: a plain assignment would only alias us_counties_updated, so
# columns later added to us_counties_totals would also appear in the source frame.
us_counties_totals = us_counties_updated.copy()

#### totals function

In [None]:
def calc_total_pop(df, cols, pop_col='population'):
    """Return a copy of ``df`` with a ``<name>_total`` column for each column in ``cols``.

    Each total is the element-wise product of the source column and ``pop_col``.
    The new column name is the source name with a trailing 'percentage' (and any
    underscores before it) replaced by '_total'; names that do not end in
    'percentage' simply get '_total' appended.

    NOTE(review): values are multiplied as-is, so the source columns are assumed
    to hold fractions (0-1). If they hold 0-100 percentages, divide by 100 — confirm.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``pop_col`` and every column named in ``cols``.
    cols : iterable of str
        Columns to convert to totals.
    pop_col : str, default 'population'
        Column holding the population counts.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If ``pop_col`` or any name in ``cols`` is not a column of ``df``.
    """
    suffix = 'percentage'
    totals = df.copy()
    for col in cols:
        # 'uninsured__percentage' -> 'uninsured'; 'rate' -> 'rate'
        base = col[:-len(suffix)].rstrip('_') if col.endswith(suffix) else col
        # Vectorized product; no row-by-row iteration needed.
        totals[f'{base}_total'] = df[col] * df[pop_col]
    return totals


In [None]:
# Percentage column -> name of the total column derived from it.
# NOTE(review): 'uninsured__percentage' (double underscore) is kept as written;
# presumably it matches the normalized name in the variable dictionary — confirm.
percent_to_total = {
    'hs_diploma_percentage': 'hs_graduates_total',
    'unemployed_percentage': 'unemployed_total',
    'uninsured__percentage': 'uninsured_total',
    'high_housing_cost_percentage': 'high_housing_cost_total',
    'child_poverty_percentage': 'child_poverty_total',
}

# Build the totals on a NEW frame (assign returns a copy), so us_counties_updated
# is not modified, then drop the percentage columns that were converted.
# The drop list is derived from the mapping, so it names 'child_poverty_percentage'
# (the column actually read) rather than the non-existent 'child_poverty'.
# NOTE(review): percentages are multiplied by population as-is; if they are on a
# 0-100 scale the totals need a further division by 100 — confirm.
us_counties_totals = us_counties_updated.assign(
    **{
        total_col: us_counties_updated[percent_col] * us_counties_updated['population']
        for percent_col, total_col in percent_to_total.items()
    }
).drop(columns=list(percent_to_total))

## Merge for Analysis

In [131]:
# Independent copies with a fresh 0..n-1 index for the merge below.
# reset_index(drop=True) returns a new frame, so race_metrics and inequity_df are
# left untouched. The old in-place reset through an alias modified both frames,
# added a junk 'index' column to each, and failed when the cell was re-run.
test_race_metrics = race_metrics.reset_index(drop=True)
test_inequitydf = inequity_df.reset_index(drop=True)

In [132]:
# Left join: every race-level row is kept and gains its county's inequity metrics.
race_inequity_df = test_race_metrics.merge(
    test_inequitydf,
    how='left',
    on=['state_abbr', 'county', 'fips'],
)

# Insights

## Basic Descriptive Stats

## Correlation and distributions

### Aggregate Data

In [None]:
# Spearman rank correlations between the numeric columns, rounded to two decimals.
matrix = (
    us_counties_updated
    .corr(method='spearman', numeric_only=True)
    .round(2)
)
# Heatmap of the correlation matrix.
sns.heatmap(matrix)

In [None]:
# Normalize the population column label by removing the trailing space.
# DataFrame.rename changes the column label itself (renaming the extracted Series
# and assigning it back under the same key left the label unchanged) and, by
# default, ignores the key when no such column exists, so this cell is safe to
# re-run.
us_counties_updated = us_counties_updated.rename(columns={'population ': 'population'})

### Probability distributions

In [None]:
# (column, panel title, bar color) for each panel, listed row by row.
# The child poverty panel reads 'child_poverty_percentage', the name this frame's
# column carries in the earlier cells; 'child_poverty' is not a column here.
hist_panels = [
    ('income_inequality', 'Income Inequality', 'skyblue'),
    ('median_income', 'Median Income', 'lightgreen'),
    ('child_care_cost_burden', 'Child Care Cost Burden', 'salmon'),
    ('unemployed_percentage', 'Unemployed Percentage', 'lightcoral'),
    ('premature_death', 'Premature Death', 'gold'),
    ('child_poverty_percentage', 'Child Poverty', 'lightblue'),
    ('child_mortality', 'Child Mortality', 'lightpink'),
    ('school_funding_gap', 'School Funding Gap', 'lightyellow'),
    ('gender_pay_gap', 'gender Pay Gap', 'lightgray'),
]

# 3x3 grid of histograms sharing one y-axis.
fig, axes = plt.subplots(3, 3, figsize=(8, 6), sharey=True)

# axes.flat walks the grid in row-major order, matching the order of hist_panels.
for ax, (column, title, color) in zip(axes.flat, hist_panels):
    ax.hist(us_counties_updated[column], bins=30, color=color, alpha=0.7)
    ax.set_title(title)

plt.tight_layout()
plt.show()

## Drill down on race

In [None]:
# Call dropna() so race_subset is a DataFrame without rows containing missing
# values; without the parentheses it was bound to the method object itself.
race_subset = race_metrics.dropna()

In [None]:
# Stacked step histogram of child poverty, colored by race.
# The column is 'child_poverty_percentage': the race_metrics cleaning cell renames
# 'child_poverty' to that name, so the old label no longer exists.
sns.histplot(
    data=race_metrics,
    x='child_poverty_percentage',
    hue='race',
    multiple='stack',
    element='step',
)

In [None]:
# Preview the first five rows of the race-level metrics table.
race_metrics.head(n=5)

### Comparing distributions between income inequality groupings: 'above_median' and 'below_median'

In [None]:
# Median of county income inequality, the cut point for the
# 'above_median' / 'below_median' groupings.
median_income_inequality = us_counties_updated['income_inequality'].median()
print(median_income_inequality)