# Importing Libraries and Cleaned Datasets

In [51]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels as sm
import scipy.stats as stats
from sklearn.feature_selection import mutual_info_regression
import warnings
warnings.filterwarnings('ignore')

county_health_rank = pd.read_csv('county_rankings25.csv')
qol_22 = pd.read_csv('QOL.csv')
fips_codes = pd.read_csv('all_fips.csv')
county_vars = pd.read_csv('county_vars_2.csv')
race_groups = pd.read_csv('agg_race_variables.csv')

# Data Cleaning Plan
## County Health Rankings 2025
In place of the previousl 4 tables reporting socio economic metrics, I found a new Dataset 'County Health Rankings.' The data wrangling process will involve selecting the target variables, renaming, normalizing data types, validating the resulting dataframe, and applying the standardized hierarchical index (county, state).

Since dataset contains hundreds of variables, I created a small dataframe of our variables of interest and normalized names to facilate creating the subset itself and its naming conventions. 

Note: much of the numeric data is in percentages, separate subsets will include totals and percentages and will be used depnding on type of analysis and indexing needed.

## FIPS & States ID table
I will merge both tables above to a previosly created 'all_fips' (which serves as a sort of  'fact table'-listing county names, state names, state abbreviations, and corresponding fips codes) by index. 

I will validate the data and clean any merging errors like the creation of duplicate columns and/or changes to naming conventions (such as adding suffixes/prefixes to duplicate and original colums).

Finally, I will merge both of these new tables by the standardized index mentioned above.

# Data Cleaning

## County Rankings tables

### Make Subset Function

In [52]:
# defining variable for function to select target vars, dictionary to rename, and set 'fips' as 'str' dtype
county_health_subset = county_vars['raw_variable'].tolist()
county_name_list = county_vars[' new_name'].tolist()
county_new_names = dict(zip(county_health_subset, county_name_list))

In [54]:
def make_subset(df, target_vars, target_names):
    df = df[target_vars]
    df = df.rename(columns=target_names) #dictionary defined from target variables & names list
    df['fips'] = df['fips'].astype('str')
    return df

In [55]:
counties = make_subset(county_health_rank, county_health_subset, county_new_names)

### Clean colums function

In [56]:
# list of state names from master fips to drop from county namers column
state_names = fips_codes['state'].unique()
state_names = state_names.tolist()
state_names.append('United States')
# variable name referring to state 
def clean_subset(df):
    df = df[~df['county'].isin(state_names)]
    df = df.round(3)
    return df

In [57]:
counties = clean_subset(counties)

## QOL- Political Party 2020

In [58]:
qol_target_vars = ['2020PopulrVoteParty', 'NMCNTY', 'FIPS', 'LSTATE']
qol_new_names = {'2020PopulrVoteParty': 'political_party', 'NMCNTY':'county', 'FIPS':'fips', 'LSTATE':'state_abbr'}

In [59]:
pol_party_2020 = make_subset(qol_22, qol_target_vars, qol_new_names)

## subsets and groupings for drill-down analysis 

## Variable-based subsets

In [60]:
inequity_vars = ['state_abbr','county','fips', 'income_inequality', 'gender_pay_gap', 'school_segration_index', 'residential_segregation','school_funding_gap']
# Socio-economic indicators with racial breakdown
racial_category_subset = race_groups['raw_variable'].tolist()
racial_category_names = race_groups['new_name'].tolist()
racial_category_cols = dict(zip(racial_category_subset, racial_category_names))

In [61]:
# cleaning and reshaping dataframe with variable containing racial-breakdowns
race_subset = make_subset(county_health_rank, racial_category_subset, racial_category_cols)
counties_race = clean_subset(race_subset)
# reshape dataframe
# Melt the dataframe to long format
counties_race = counties_race.melt(
    id_vars=['state_abbr', 'county', 'fips', 'county_population'], 
    var_name='metric_race', 
    value_name='value'
)

# Split the combined 'metric_ethnicity' column
counties_race[['metric', 'race']] = counties_race['metric_race'].str.extract(r'(.+?)-(.+)')

# Drop the original combined column
counties_race = counties_race.drop(columns='metric_race')

# Pivot to have metrics as columns
counties_race = counties_race.pivot_table(
    index=['state_abbr', 'county', 'fips', 'race', 'county_population'],
    columns='metric',
    values='value'
).reset_index()

# Clean up column names if needed
counties_race.columns.name = None

### County-Level socio-economic outcomes (no racial breakdown)

In [None]:
aggregate_outcomes = counties.drop(columns=['income_inequality', 'gender_pay_gap', 'school_segration_index', 'residential_segregation','school_funding_gap', 'child_mortality', 'child_poverty_percentage', 'firearm_fatalities',
       'life_expectancy', 'median_income', 'premature_death'])
agg_vars = aggregate_outcomes.columns.to_list()
#aggregate_vars = aggregate_vars.append('political_party')
agg_vars.remove('index')
agg_vars.append('political_party')

## Total Values (vs percentages)

In [63]:
pop_vars = ['state_abbr','county','fips', 'population']
county_pop = counties[pop_vars]
# find columns w percentage ouputs by assigining target string to variable ans using lamda  func to filter col names and assign to list
percent_str = 'percentage'
percent_vars = list(filter(lambda x: percent_str in x, county_name_list))
counties_totals = counties.copy()
counties_pop_col = 'population'

#### totals function

In [64]:
# multiplies each percentage column by county polulation to yield total amounts
def calc_total_pop(df, cols, pop_col):
    for i in df[cols]:
        df[i] = df[i] * df[pop_col]
    df.columns = df.columns.str.replace('_percentage', '')
    return df

In [66]:
counties_totals= calc_total_pop(counties_totals, percent_vars, counties_pop_col)

### Totals- broken down by race

In [67]:
race_pop_col = 'county_population'
race_percentage_vars = [ 'population_percentage', 'child_poverty_percentage']
racial_breakdown_total = counties_race.copy()

In [68]:
counties_race_totals = calc_total_pop(racial_breakdown_total, race_percentage_vars, race_pop_col)

Note: Many missing values

## Merge for Analysis

In [69]:
fips_codes['fips'] = fips_codes['fips'].astype('str')
index = ['state_abbr', 'county', 'fips']

In [70]:
fips_codes.reset_index(inplace=True)
pol_party_2020.reset_index(inplace=True) #state_abbr
counties.reset_index(inplace=True) #state_abbr
# setting common indices

### Merge dataframes using 'state' with 'fact' table ('all_fips') and set indices for uniform id variables

In [71]:
fips_codes.set_index(index)
pol_party_2020.set_index(index)
counties.set_index(index)
pol_party_indexed = pd.merge(pol_party_2020, fips_codes, on=index, how='inner')
counties_indexed = pd.merge(counties, fips_codes, on=index, how='inner')

In [72]:
pol_party_indexed= pol_party_indexed.drop(columns='state')
counties_indexed= counties_indexed.drop(columns='state')

### Merge counties and pol_party df's on common multi-index

In [None]:
counties_pol_party.head()

In [74]:
# function to clean merged tables
def clean_merged(df):
    # Drop duplicate rows in column: 'fips'
    df = df.drop_duplicates(subset='fips')
    df = df[~df['county'].isin(state_names)]
    df = df.round(3)
    return df


In [None]:
counties_pol_party = clean_merged(counties_pol_party)
counties_pol_party.reset_index()

In [77]:
def clean_extra_indices(df):
    # Drop columns: 'index_x_x', 'index_y_x' and 2 other columns

    df = df.drop(columns=['index_x_x', 'index_y_x', 'index_x_y', 'index_y_y'])
    return df

In [78]:
counties_pol_party = clean_extra_indices(counties_pol_party.copy())
counties_pol_party.head()

Unnamed: 0,fips,county,state_abbr,hs_diploma_percentage,unemployed_percentage,population,median_income,school_segration_index,gender_pay_gap,uninsured__percentage,...,residential_segregation,premature_death,income_inequality,child_poverty_percentage,life_expectancy,child_mortality,high_housing_cost_percentage,firearm_fatalities,child_care_cost_burden,political_party
0,1001,Autauga County,AL,0.903,0.022,60342.0,68857.0,0.058,0.707,0.082,...,33.959,9938.263,4.274,0.17,74.801,61.256,0.117,17.454,0.218,R
1,1003,Baldwin County,AL,0.917,0.023,253507.0,74248.0,0.086,0.722,0.102,...,42.653,8957.113,4.36,0.141,76.58,50.158,0.101,14.962,0.194,R
2,1005,Barbour County,AL,0.778,0.044,24585.0,45298.0,0.036,0.77,0.121,...,22.729,12738.656,5.614,0.348,72.709,71.707,0.095,26.65,0.286,R
3,1007,Bibb County,AL,0.803,0.025,21868.0,56025.0,0.09,0.823,0.108,...,40.01,11708.948,5.347,0.214,72.977,87.077,0.091,21.542,0.266,R
4,1009,Blount County,AL,0.827,0.021,59816.0,64962.0,0.094,0.813,0.125,...,64.886,11898.088,4.62,0.166,72.936,58.036,0.075,18.145,0.253,R


## Inegrate w race categories

Subset merged df with variables for aggregate data (not contained in race dataframe) to avoid duplicate columns

In [None]:
counties_race.reset_index()
test_agg = pol_party_counties[agg_vars]
counties_race.set_index(['state_abbr', 'county', 'fips','race'])
test_agg.set_index(index)

Unnamed: 0,state_abbr,county,fips,race,county_population,child_mortality_x,child_poverty_percentage_x,firearm_fatalities_x,life_expectancy_x,median_income_x,...,residential_segregation,premature_death_y,income_inequality,child_poverty_percentage_y,life_expectancy_y,child_mortality_y,high_housing_cost_percentage,firearm_fatalities_y,child_care_cost_burden,political_party
0,AK,Aleutians East Borough,2013,aian,3461.0,,0.263,,,52237.0,...,,4200.337,4.733,0.186,,,0.085,,0.216,R
1,AK,Aleutians East Borough,2013,asian,3461.0,,,,,96726.0,...,,4200.337,4.733,0.186,,,0.085,,0.216,R
2,AK,Aleutians East Borough,2013,black,3461.0,,,,,,...,,4200.337,4.733,0.186,,,0.085,,0.216,R
3,AK,Aleutians East Borough,2013,hispanic,3461.0,,,,,81818.0,...,,4200.337,4.733,0.186,,,0.085,,0.216,R
4,AK,Aleutians East Borough,2013,nhopi,3461.0,,,,,,...,,4200.337,4.733,0.186,,,0.085,,0.216,R
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18853,WY,Weston County,56045,asian,6808.0,,,,,,...,,6754.810,4.174,0.140,77.999,,0.096,,0.231,R
18854,WY,Weston County,56045,black,6808.0,,,,,,...,,6754.810,4.174,0.140,77.999,,0.096,,0.231,R
18855,WY,Weston County,56045,hispanic,6808.0,,,,,,...,,6754.810,4.174,0.140,77.999,,0.096,,0.231,R
18856,WY,Weston County,56045,nhopi,6808.0,,,,,,...,,6754.810,4.174,0.140,77.999,,0.096,,0.231,R


In [None]:
all_sets= counties_race.merge(test_agg, how='left', on=index)
all_sets.set_index(['state_abbr', 'county', 'fips','race'], inplace=True)

In [None]:
all_sets.reset_index(inplace=True)
all_sets['race'] = all_sets['race'].astype('category')

### Prep hierachical dataframe for anlalysis
1. data imputation for missing values
2. standardization

In [None]:
# drop missing population values- indicates no county reporting for racial group
# too much missing data for imputation to entire seet- median income has highest reporting for racial groups
all_sets = all_sets.dropna(subset=['population','median_income'])


In [123]:
# dropping colums w more than half of records missing
all_sets = all_sets.drop(columns=['firearm_fatalities', 'child_mortality'])

In [124]:
all_sets.reset_index()

Unnamed: 0,index,state_abbr,county,fips,race,county_population,child_poverty_percentage,life_expectancy,median_income,population_percentage,premature_death,hs_diploma_percentage,unemployed_percentage,population,uninsured__percentage,high_housing_cost_percentage,child_care_cost_burden,political_party
0,0,AK,Aleutians East Borough,2013,aian,3461.0,0.263,,52237.0,0.133,32661.178,0.839,0.021,3461.0,0.196,0.085,0.216,R
1,1,AK,Aleutians East Borough,2013,asian,3461.0,,,96726.0,0.442,,0.839,0.021,3461.0,0.196,0.085,0.216,R
2,3,AK,Aleutians East Borough,2013,hispanic,3461.0,,,81818.0,0.207,,0.839,0.021,3461.0,0.196,0.085,0.216,R
3,5,AK,Aleutians East Borough,2013,white,3461.0,,,98250.0,0.096,,0.839,0.021,3461.0,0.196,0.085,0.216,R
4,6,AK,Aleutians West Census Area,2016,aian,5160.0,0.124,,72500.0,0.082,33069.552,0.905,0.030,5160.0,0.167,0.036,0.312,R
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9797,18843,WY,Uinta County,56041,hispanic,20745.0,0.039,81.762,77576.0,0.102,,0.946,0.032,20745.0,0.137,0.062,0.176,R
9798,18845,WY,Uinta County,56041,white,20745.0,0.068,74.738,83964.0,0.864,10820.292,0.946,0.032,20745.0,0.137,0.062,0.176,R
9799,18849,WY,Washakie County,56043,hispanic,7710.0,0.139,,38482.0,0.141,,0.914,0.033,7710.0,0.150,0.068,0.270,R
9800,18851,WY,Washakie County,56043,white,7710.0,0.123,76.417,64919.0,0.821,8911.662,0.914,0.033,7710.0,0.150,0.068,0.270,R


In [125]:
all_sets.describe()

Unnamed: 0,county_population,child_poverty_percentage,life_expectancy,median_income,population_percentage,premature_death,hs_diploma_percentage,unemployed_percentage,population,uninsured__percentage,high_housing_cost_percentage,child_care_cost_burden
count,9802.0,7280.0,6286.0,9802.0,9802.0,6237.0,9802.0,9802.0,9802.0,9802.0,9802.0,9801.0
mean,154054.8,0.226421,76.947166,68040.000306,0.304052,11031.708443,0.890253,0.035973,154054.8,0.103463,0.105316,0.287607
std,411358.6,0.161533,7.197955,28651.737074,0.343426,6092.371971,0.051155,0.01183,411358.6,0.045485,0.03403,0.070374
min,217.0,0.002,51.092,4189.0,0.001,1441.206,0.571,0.01,217.0,0.021,0.0,0.123
25%,16924.0,0.108,72.32525,50505.25,0.024,7038.826,0.862,0.028,16924.0,0.068,0.083,0.236
50%,41824.0,0.185,76.0175,63752.0,0.1,9860.561,0.901,0.034,41824.0,0.0935,0.1,0.28
75%,121939.0,0.306,80.36625,80000.0,0.624,13572.957,0.928,0.041,121939.0,0.131,0.124,0.332
max,9663345.0,0.99,143.311,249138.0,0.975,81499.082,1.0,0.173,9663345.0,0.348,0.295,0.652


In [128]:
print(all_sets.groupby('race').describe())

         county_population                                                  \
                     count           mean            std     min       25%   
race                                                                         
aian                1281.0  209084.852459  497004.596208   765.0  24409.00   
asian               1291.0  225496.721146  492683.668053  1095.0  36305.50   
black               1752.0  172955.369863  432105.235922  1294.0  21690.75   
hispanic            2370.0  134355.617722  377424.233239   568.0  15496.75   
nhopi                  0.0            NaN            NaN     NaN       NaN   
white               3108.0  106065.135457  333518.385221   217.0  10854.00   

                                        child_poverty_percentage            \
              50%        75%        max                    count      mean   
race                                                                         
aian      59757.0  183215.00  9663345.0                    678.

### Data Imputation

# Insights

## Standardizing and preparing for variable selection

In [None]:
independent_vars = ['income_inequality', 'gender_pay_gap',
       'school_segration_index', 'residential_segregation',
       'school_funding_gap']

dependent_vars = ['hs_diploma_percentage', 'unemployed_percentage', 'population', 'median_income', 'uninsured__percentage', 'premature_death', 'child_poverty_percentage', 'life_expectancy', 'child_mortality', 'high_housing_cost_percentage', 'firearm_fatalities', 'child_care_cost_burden']
vars_to_z = independent_vars + dependent_vars
#standardize_frame = counties[vars_to_z]
standardized_frame = counties.dropna()

In [None]:
# multiplies each percentage column by county polulation to yield total amounts
def calc_z(df, cols):
    for i in df[cols]:
        df[i] = np.abs(stats.zscore(df[i])) 
    return df


In [None]:
z_counties = calc_z(standardized_frame, vars_to_z)

## Mutual Information: finding highly dependent Vartiables

### Creating a Matrix of pairs fo each dependent var w independent var

In [None]:
# Initialize a matrix to store mutual information
mutual_info_matrix = np.zeros((len(independent_vars), len(dependent_vars)))

# Calculate mutual information for each independent-dependent variable pair
for i, var1 in enumerate(independent_vars):
    for j, var2 in enumerate(dependent_vars):
        X = z_counties[[var1]]
        y = z_counties[var2]
        mutual_info = mutual_info_regression(X, y)
        mutual_info_matrix[i, j] = mutual_info[0]  # Store the mutual info scalar

# Convert the matrix to a DataFrame for easier viewing
mutual_info_df = pd.DataFrame(mutual_info_matrix, index=independent_vars, columns=dependent_vars)

# Display the mutual information DataFrame
print(mutual_info_df)


### Visualizing Pairwise Mutual Information as Heatmap

In [None]:
plt.figure(figsize=(10, 8))
sns.heatmap(mutual_info_df, annot=True, cmap='flare', square=True)
plt.title('Pairwise Mutual Information- Inequity')
plt.show()

### Getting top values of dependent pairs

In [None]:
top_var_pairs = mutual_info_df.unstack().sort_values(ascending=False).head(15)
print(top_var_pairs)

### Data Description

In [None]:
z_counties.describe()

### Correlation and Covariance

In [None]:
matrix = counties.corr(method='spearman', numeric_only=True).round(2)
sns.heatmap(matrix, annot=True, cmap='flare')

In [None]:
sns.pairplot(counties, 
    x_vars= ['income_inequality', 'gender_pay_gap', 'school_segration_index', 'residential_segregation','school_funding_gap'],
    y_vars= ['child_mortality', 'child_poverty_percentage','firearm_fatalities', 'life_expectancy', 'median_income', 'premature_death'],
    )     

### Distributions

#### Income Inequality

In [None]:
inequity_df = z_counties[inequity_vars]
inequity_df.hist(figsize=(12,12), layout=(3,3), color='teal', ec='black', sharex=False);

**Note**: note normally distributed

#### Social Indicators

In [None]:
aggregate_outcomes_dist = aggregate_outcomes.drop(columns='population')
aggregate_outcomes_dist.hist(figsize=(12,12), layout=(3,3), sharex=False);

plt.show()

## Regression

In [None]:
fig, ax = plt.subplots()  
ax.scatter(us_counties_updated['income_inequality'], us_counties_updated['child_mortality'])  
ax.set_xlabel('Income Inequality Index')  
ax.set_ylabel('Child Mortality')  
plt.show()

## Drill down on race

### Pair Plots

In [None]:
race_anlysis = counties_race.copy()
race_anlysis = counties_race.drop(columns=['county_population', 'population_percentage'])

In [None]:
sns.pairplot(race_anlysis, hue='race')

### Comparing distributions between income inequality groupings: 'above_median' and 'below_median'

In [None]:
print(us_counties_updated['income_inequality'].median())