# Importing Libraries and Cleaned Datasets

In [98]:
#import kagglehub
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt
import statsmodels as sm

county_health_rank = pd.read_csv('county_rankings25.csv')
fips_codes = pd.read_csv('all_fips.csv')
county_vars = pd.read_csv('county_rank_vars.csv')
ltss_2022 = pd.read_csv('ltss_2022.csv')
ltss_expenses = pd.read_csv('ltss_expenses.csv')

# Data Cleaning Plan
## County Health Rankings 2025
In place of the previousl 4 tables reporting socio economic metrics, I found a new Dataset 'County Health Rankings.' The data wrangling process will involve selecting the target variables, renaming, normalizing data types, validating the resulting dataframe, and applying the standardized hierarchical index (county, state).

Since dataset contains hundreds of variables, I created a small dataframe of our variables of interest and normalized names to facilate creating the subset itself and its naming conventions. 


## LTSS Enrollment and Expenditure tables
The state-level LTSS enrollment and expenditure tables remain and their cleaning involves: subsetting by columns neeeded reporting metrics for enrollment and expenses. These variables reflect enrollment in the first table and expenses in the second for (1)The entire LTSS program, (2) Institutional care, and (3)HCBS program. 

I will subset each table by thsese colums, rename by the conventions defined at the beginning of the section, remove invalid characters ('$', ',') from numeric columns, change the datatypes of the columns to floats for quantitative variables ans strings for 'id' variables, and set 'state' as index prior to merging.

## FIPS & States ID table
I will merge both tables above to a previosly created 'all_fips' (which serves as a sort of  'fact table'-listing county names, state names, state abbreviations, and corresponding fips codes) by index. 

I will validate the data and clean any merging errors like the creation of duplicate columns and/or changes to naming conventions (such as adding suffixes/prefixes to duplicate and original colums).

Finally, I will merge both of these new tables by the standardized index mentioned above.

# Data Cleaning

## County Rankings tables

create subset, clean result, and merge with 'fips_codes'

In [99]:
county_health_subset = county_vars['raw_variable'].tolist()
new_names = county_vars[' new_name'].tolist()
county_health_cols = dict(zip(county_health_subset, new_names))
# creating subset and renaiming with list and dictionary from above
county_health= county_health_rank[county_health_subset]
county_health= county_health.rename(columns=county_health_cols)

In [100]:
# normalizing fips codes
county_health['fips'] = county_health['fips'].astype('str')
fips_codes['fips'] = fips_codes['fips'].astype('str')

In [101]:
#result = pd.merge(left, right, how="outer", on=["key1", "key2"])
us_counties = pd.merge(county_health, fips_codes, how='outer', on='fips')
# creating new dataframe without non-county data to keep original dataframe for 'non_county' data subset
us_counties_updated = us_counties.dropna(subset=['state'])
# these are states, the US, and 'planning' regions
non_counties = us_counties[us_counties['state'].isnull()]

### clean new table

In [102]:
def clean_merge(us_counties_updated):
    # Drop columns: 'county_x', 'state_abbr_x'
    us_counties_updated = us_counties_updated.drop(columns=['county_x', 'state_abbr_x'])
    # Rename column 'county_y' to 'county'
    us_counties_updated = us_counties_updated.rename(columns={'county_y': 'county'})
    # Rename column 'state_abbr_y' to 'state_abbr'
    us_counties_updated = us_counties_updated.rename(columns={'state_abbr_y': 'state_abbr'})
    return us_counties_updated

In [103]:
us_counties_updated = clean_merge(us_counties_updated)

## LTSS Tables

In [105]:
# defining columns to keep and subsetting dataframes
ltss_cols= ['State', 'LTSS (total)', 'Institutional (total)', 'HCBS (total)']
ltss_population = ltss_2022[ltss_cols]
ltss_expenditures = ltss_expenses[ltss_cols]

### Cleaning invalid characters in columns to prep for datatype conversion to float

In [106]:
# removing '$' from ltss)_expenditures and changing to float
ltss_expenditures['LTSS (total)'] = ltss_expenditures['LTSS (total)'].str[1:]
ltss_expenditures['Institutional (total)'] = ltss_expenditures['Institutional (total)'].str[1:]
ltss_expenditures['HCBS (total)'] = ltss_expenditures['HCBS (total)'].str[1:]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ltss_expenditures['LTSS (total)'] = ltss_expenditures['LTSS (total)'].str[1:]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ltss_expenditures['Institutional (total)'] = ltss_expenditures['Institutional (total)'].str[1:]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ltss_expenditures['HCBS (total)

In [107]:
# removing ',' from ltss)_expenditures and changing to float
ltss_expenditures['LTSS (total)'] = ltss_expenditures['LTSS (total)'].replace(',', '', regex=True).astype(float)
ltss_expenditures['Institutional (total)'] = ltss_expenditures['Institutional (total)'].replace(',', '', regex=True).astype(float)
ltss_expenditures['HCBS (total)'] = ltss_expenditures['HCBS (total)'].replace(',', '', regex=True).astype(float)

# removing ',' and changing to float - ltss_population
ltss_population['LTSS (total)'] = ltss_population['LTSS (total)'].str.replace(',', '').astype(float)
ltss_population['Institutional (total)'] = ltss_population['Institutional (total)'].str.replace(',', '').astype(float)
ltss_population['HCBS (total)'] = ltss_population['HCBS (total)'].str.replace(',', '').astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ltss_expenditures['LTSS (total)'] = ltss_expenditures['LTSS (total)'].replace(',', '', regex=True).astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ltss_expenditures['Institutional (total)'] = ltss_expenditures['Institutional (total)'].replace(',', '', regex=True).astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing

### Renaming columns

In [108]:
ltss_pop_names = {'State': 'state', 'LTSS (total)': 'ltss_total_enrollment', 'Institutional (total)': 'insitutional_enrollment', 'HCBS (total)': 'hcbs_enrollment'}
ltss_expense_names = { 'State':'state', 'LTSS (total)': 'ltss_total_expense', 'Institutional (total)': 'insitutional_expense', 'HCBS (total)': 'hcbs_expense'}

ltss_population = ltss_population.rename(columns=ltss_pop_names)
ltss_expenditures = ltss_expenditures.rename(columns=ltss_expense_names)

### Prepping for merge 
- Setting index to 'state'
- renaming DC (different spellings in each dataset) to uniform spelling, and 'National'to 'United States' for easier slicing later on 

In [109]:
ltss_population.set_index(['state'])
ltss_expenditures.set_index(['state'])

Unnamed: 0_level_0,ltss_total_expense,insitutional_expense,hcbs_expense
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
National,200375800000.0,71000480000.0,129375300000.0
Alabama,1794100000.0,1143893000.0,650206200.0
Alaska,592077800.0,222700500.0,369377300.0
Arizona,2641568000.0,720799100.0,1920769000.0
Arkansas,1752396000.0,1128938000.0,623457900.0
California,16148610000.0,7738301000.0,8410313000.0
Colorado,3443419000.0,688824000.0,2754595000.0
Connecticut,3652255000.0,1560325000.0,2091930000.0
Delaware,847212000.0,280967400.0,566244500.0
District of Columbia,1174536000.0,434164900.0,740371200.0


In [110]:
ltss_state_rename_enr = {'National': 'United States','District of\nColumbia':'District of Columbia'}
ltss_state_rename_exp = {'National': 'United States','District of Columbia':'District of Columbia'}
ltss_expenditures['state'] = ltss_expenditures['state'].replace(ltss_state_rename_exp)
ltss_population['state'] = ltss_population['state'].replace(ltss_state_rename_enr)

In [111]:
ltss_combined = pd.merge(ltss_population, ltss_expenditures, on='state', how='outer')
ltss_combined.set_index('state')

Unnamed: 0_level_0,ltss_total_enrollment,insitutional_enrollment,hcbs_enrollment,ltss_total_expense,insitutional_expense,hcbs_expense
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Alabama,97317.0,22596.0,77260.0,1794100000.0,1143893000.0,650206200.0
Alaska,18458.0,1574.0,17157.0,592077800.0,222700500.0,369377300.0
Arizona,161936.0,23312.0,142668.0,2641568000.0,720799100.0,1920769000.0
Arkansas,59121.0,22589.0,37853.0,1752396000.0,1128938000.0,623457900.0
California,462458.0,131303.0,343732.0,16148610000.0,7738301000.0,8410313000.0
Colorado,114536.0,13464.0,103693.0,3443419000.0,688824000.0,2754595000.0
Connecticut,82124.0,23733.0,63934.0,3652255000.0,1560325000.0,2091930000.0
Delaware,25773.0,4518.0,22276.0,847212000.0,280967400.0,566244500.0
District of Columbia,22421.0,5506.0,17517.0,1174536000.0,434164900.0,740371200.0
Florida,231965.0,69337.0,169922.0,4910445000.0,1658581000.0,3251865000.0


## Merge for analysis

In [113]:
counties_w_ltss_state = pd.merge(us_counties_updated, ltss_combined, on='state', how='left')

### clean result

In [115]:
# drop emplty col(living_wage) and rename ltss values to clarify these are state values (not county)
us_counties_and_ltss = counties_w_ltss_state.drop(columns=['living_wage'])
us_counties_and_ltss = us_counties_and_ltss.rename(columns={'ltss_total_enrollment': 'state_ltss_enrollment', 'insitutional_enrollment': 'state_insitutional_enrollment', 'hcbs_enrollment': 'state_hcbs_enrollment', 'ltss_total_expense': 'state_ltss_expense', 'insitutional_expense': 'state_insitutional_expense', 'hcbs_expense': 'state_hcbs_expense'})

In [None]:
#us_counties_and_ltss.to_csv('us_counties_and_ltss.csv')