# Fetching Data and Basic Statistics

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Load the cleaned health-metrics table; every later cell works off `df`.
DATA_FILE = 'Health_Data_Cleaned.csv'
df = pd.read_csv(DATA_FILE)

In [5]:
# Preview the first rows to check the column layout right after loading.
df.head()

Unnamed: 0,metric_name,group_name,data_period,est,lci,uci,geo_name,state_abbr,period_type,source_name
0,Binge Drinking,Total,2022,18.6,16.3,20.9,Hawaii,HI,1 year modeled estimate,PLACES
1,Binge Drinking,Total,2022,18.7,16.8,20.6,Honolulu,HI,1 year modeled estimate,PLACES
2,Binge Drinking,Total,2022,17.3,15.1,19.5,Kauai,HI,1 year modeled estimate,PLACES
3,Binge Drinking,Total,2022,17.7,15.6,19.8,Maui,HI,1 year modeled estimate,PLACES
4,Binge Drinking,Total,2022,17.0,14.3,19.7,Auburn,AL,1 year modeled estimate,PLACES


In [None]:
# List the column labels; the following cell selects a subset of them by name.
df.columns

In [None]:
# Keep only the columns used by the analysis below. Compared with the
# preview of the loaded file, this discards 'Unnamed: 0' and 'group_name'.
# NOTE(review): the preview only shows group_name == 'Total'; if the file
# holds other groups, their rows can no longer be told apart after this
# step -- confirm before relying on per-city figures.
KEEP_COLUMNS = [
    'metric_name', 'data_period', 'est', 'lci', 'uci',
    'geo_name', 'state_abbr', 'period_type', 'source_name',
]
df = df[KEEP_COLUMNS]

In [None]:
# (rows, columns) of the table after the column selection.
df.shape

In [None]:
# Preview again to confirm which columns remain.
df.head()

In [None]:
# Column dtypes and non-null counts -- a quick check for missing values.
df.info()

In [None]:
# Summary statistics for the numeric columns.
# NOTE(review): these pool every metric_name together, so the 'est'
# statistics mix different indicators -- treat them as a sanity check only.
df.describe()

# Hypothesis Generation and Testing

## 1. Which cities have the highest and lowest rates of chronic health conditions like obesity, diabetes, and high blood pressure?

In [None]:
# 1. Obesity: the ten highest and the ten lowest rows by estimate.
# Rows are ranked as they are, so a place that occurs in several 'Obesity'
# rows can show up more than once in either list.
is_obesity = df['metric_name'] == 'Obesity'
obesity = df[is_obesity]
top_cities = obesity.sort_values('est', ascending=False).head(10)
bottom_cities = obesity.sort_values('est').head(10)


In [None]:
# Bar chart of the ten highest obesity estimates.
# `top_cities` already holds ten rows, so no further .head() is needed.
# Bars are labelled "City, ST": with the bare city name, two places sharing
# a name would fall into one category and seaborn would draw a single
# averaged bar for them.
top_labeled = top_cities.assign(
    city=top_cities['geo_name'] + ', ' + top_cities['state_abbr'].fillna('n/a')
)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=top_labeled, x='est', y='city', hue='city',
            palette='flare', legend=False, ax=ax)
ax.set_title('Top 10 Cities by Obesity Rate')
ax.set_xlabel('Obesity Rate (%)')
ax.set_ylabel('City')
fig.tight_layout()
plt.show()

In [None]:
# Bar chart of the ten lowest obesity estimates.
# `bottom_cities` already holds ten rows, so no further .head() is needed.
# Bars are labelled "City, ST": with the bare city name, two places sharing
# a name would fall into one category and seaborn would draw a single
# averaged bar for them.
bottom_labeled = bottom_cities.assign(
    city=bottom_cities['geo_name'] + ', ' + bottom_cities['state_abbr'].fillna('n/a')
)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=bottom_labeled, x='est', y='city', hue='city',
            palette="YlOrBr", legend=False, ax=ax)
ax.set_title('Bottom 10 Cities by Obesity Rate')
ax.set_xlabel('Obesity Rate (%)')
ax.set_ylabel('City')
fig.tight_layout()
plt.show()

## 2. Is there a correlation between frequent mental distress and chronic physical health issues like diabetes or cardiovascular disease?

In [None]:
# 2. Correlation: Mental Distress vs Diabetes
#
# A city is identified by (geo_name, state_abbr). City names can repeat
# across states, and a metric can have more than one row per city; joining
# on the name alone would then pair every matching row of one metric with
# every matching row of the other, duplicating points and distorting the
# correlation. Estimates are therefore averaged per city first, so each
# city contributes exactly one point.
CITY_KEY = ['geo_name', 'state_abbr']

mental = (
    df[df['metric_name'] == 'Frequent Mental Distress']
    # dropna=False keeps rows whose state is missing instead of silently
    # discarding them during grouping.
    .groupby(CITY_KEY, dropna=False)['est'].mean()
    .rename('mental_distress')
    .reset_index()
)
diabetes = (
    df[df['metric_name'] == 'Diabetes']
    .groupby(CITY_KEY, dropna=False)['est'].mean()
    .rename('diabetes')
    .reset_index()
)

# Inner join: only cities that report both metrics are compared.
# validate= makes pandas raise if either side ever stops being one row per city.
merged = pd.merge(mental, diabetes, on=CITY_KEY, validate='one_to_one')
correlation = merged[['mental_distress', 'diabetes']].corr()

In [None]:
# Correlation heatmap for mental distress vs diabetes.
# The colour scale is pinned to [-1, 1] so colours mean the same thing in
# every correlation plot (the default scale stretches to the data range),
# and each cell is annotated so the coefficient can be read directly.
fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(correlation, annot=True, fmt='.2f', vmin=-1, vmax=1,
            cmap='coolwarm', square=True, ax=ax)
ax.set_title('Frequent Mental Distress vs Diabetes (correlation)')
fig.tight_layout()
plt.show()

## 3. Do cities with higher rates of firearm suicides also have higher rates of mental distress or binge drinking?

In [None]:
# 3. Firearm Suicides vs Binge Drinking or Mental Distress
#
# This cell builds all three inputs itself instead of reusing `mental` from
# an earlier cell, so it gives the same result regardless of what ran before.
CITY_KEY = ['geo_name', 'state_abbr']


def city_means(metric, column):
    """Return one row per (geo_name, state_abbr) with the mean `est` of `metric`.

    Parameters
    ----------
    metric : str
        Value of `metric_name` to select from `df`.
    column : str
        Name given to the averaged estimate in the result.

    Returns
    -------
    pandas.DataFrame
        Columns: geo_name, state_abbr, and `column`.
    """
    rows = df[df['metric_name'] == metric]
    # dropna=False keeps rows whose state is missing rather than dropping them.
    return (
        rows.groupby(CITY_KEY, dropna=False)['est'].mean()
        .rename(column)
        .reset_index()
    )


firearm = city_means('Firearm Suicides', 'firearm_suicide')
binge = city_means('Binge Drinking', 'binge_drinking')
mental = city_means('Frequent Mental Distress', 'mental_distress')

# Join on city AND state: joining on the name alone would cross-match
# same-named cities and duplicate rows.
merged = firearm.merge(binge, on=CITY_KEY).merge(mental, on=CITY_KEY)

# Correlate only the numeric estimate columns. Calling .corr() on the whole
# frame would include the string key columns, which newer pandas versions
# reject with an error rather than skipping.
correlation_matrix = merged[['firearm_suicide', 'binge_drinking', 'mental_distress']].corr()

In [None]:
# Correlation heatmap for firearm suicides, binge drinking and mental distress.
# The colour scale is pinned to [-1, 1] so colours mean the same thing in
# every correlation plot (the default scale stretches to the data range),
# and each cell is annotated so the coefficient can be read directly.
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', vmin=-1, vmax=1,
            cmap='coolwarm', square=True, ax=ax)
ax.set_title('Firearm Suicides, Binge Drinking and Mental Distress (correlation)')
fig.tight_layout()
plt.show()

## 4. What is the geographic distribution of frequent mental distress across states?

In [None]:
# Rows for the frequent-mental-distress metric only (still one row per
# source record at this point, despite the variable name).
distress_state_avg = df.loc[df['metric_name'] == 'Frequent Mental Distress']

# Unweighted mean of the estimates within each state
state_distress = (
    distress_state_avg
    .groupby('state_abbr', as_index=False)['est']
    .mean()
)

# One bar per state, coloured by its value
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    data=state_distress,
    x='state_abbr',
    y='est',
    hue='est',
    palette='viridis',
    legend=False,
    ax=ax,
)
ax.set_title('Average Rate of Frequent Mental Distress by State')
ax.set_xlabel('State')
ax.set_ylabel('Frequent Mental Distress Rate (%)')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()


## 5. Are certain states or regions consistently above or below national averages in preventable death metrics?

In [None]:
# Premature deaths: per-state mean compared with the overall mean.
premature = df[df['metric_name'] == 'Premature Deaths (All Causes)']

# "National" average here is the plain mean over every premature-death row,
# not a population-weighted national rate.
national_avg = premature['est'].mean()

# Unweighted mean per state, flagged by whether it exceeds the overall mean.
# Built in one expression so re-running the cell never sees a half-built frame.
state_avg = (
    premature.groupby('state_abbr')['est'].mean()
    .reset_index()
    .assign(Above_National=lambda d: d['est'] > national_avg)
)

fig, ax = plt.subplots(figsize=(14, 6))
sns.barplot(data=state_avg, x='state_abbr', y='est', hue='Above_National',
            dodge=False, palette='Set2', ax=ax)
# Reference line so the threshold behind the colouring is visible on the chart.
ax.axhline(national_avg, color='black', linestyle='--', linewidth=1)
ax.set_title(f'Average Premature Death Rate by State (dashed line: overall mean = {national_avg:.1f})')
ax.set_xlabel('State')
ax.set_ylabel('Premature Death Rate (per 100,000)')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

## 6. Do cities with poor access to dental care also report higher physical distress or chronic disease rates?

In [None]:
# Dental care vs frequent physical distress, one point per city.
#
# A city is identified by (geo_name, state_abbr); joining on the name alone
# would cross-match same-named cities in different states and duplicate rows.
# Estimates are averaged per city first so each city is one point.
# NOTE(review): the metric is named 'Dental Care'; confirm against the data
# dictionary whether a higher value means more or less access before
# interpreting the slope.
CITY_KEY = ['geo_name', 'state_abbr']

dental = (
    df[df['metric_name'] == 'Dental Care']
    .groupby(CITY_KEY, dropna=False)['est'].mean()
    .rename('dental_care')
    .reset_index()
)
physical = (
    df[df['metric_name'] == 'Frequent Physical Distress']
    .groupby(CITY_KEY, dropna=False)['est'].mean()
    .rename('physical_distress')
    .reset_index()
)

merged = dental.merge(physical, on=CITY_KEY)

fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=merged, x='dental_care', y='physical_distress', ax=ax)
# Overlay only the fitted line; the points are already drawn above.
sns.regplot(data=merged, x='dental_care', y='physical_distress',
            scatter=False, color='red', ax=ax)
ax.set_title('Dental Care vs Frequent Physical Distress by City')
ax.set_xlabel('Dental Care (estimate)')
ax.set_ylabel('Frequent Physical Distress (estimate)')
fig.tight_layout()
plt.show()

## 7. Do health estimates differ significantly between data sources for similar indicators, and why might that be?

In [None]:
# Mean estimate for every (metric, source) pair.
comparison = df.groupby(['metric_name', 'source_name'])['est'].mean().reset_index()

# One row of bars per metric, so the figure height grows with the number of
# metrics to keep the labels legible.
n_metrics = comparison['metric_name'].nunique()
fig, ax = plt.subplots(figsize=(10, max(4, 0.4 * n_metrics)))
sns.barplot(data=comparison, x='est', y='metric_name', hue='source_name', ax=ax)
ax.set_title('Mean Estimate by Metric and Data Source')
# Other cells label some metrics as percentages and others as rates per
# 100,000, so bar lengths are only comparable within a metric, not across.
ax.set_xlabel('Mean estimate (units differ by metric)')
ax.set_ylabel('Metric')
fig.tight_layout()
plt.show()

## 8. Is there a significant association between lack of dental care and chronic diseases like diabetes?

In [None]:
# Filter relevant metrics
relevant_metrics = ['Dental Care', 'Diabetes']
filtered = df[df['metric_name'].isin(relevant_metrics)]

# Reshape to one row per city with one column per metric.
# The city is keyed by (geo_name, state_abbr): keyed by name alone, cities
# that share a name across states would be averaged into a single row.
# dropna=False keeps rows whose state is missing; the trailing .dropna()
# then removes cities that lack either metric.
pivot_df = (
    filtered
    .groupby(['geo_name', 'state_abbr', 'metric_name'], dropna=False)['est']
    .mean()
    .unstack('metric_name')
    .dropna()
    .reset_index()
)

# Correlation between the two metric columns
corr = pivot_df['Dental Care'].corr(pivot_df['Diabetes'])
# The wording uses the metric's own name. NOTE(review): nothing in this
# notebook shows whether 'Dental Care' measures having or lacking care;
# confirm against the data dictionary before describing it as "lack of".
print(f"Correlation between Dental Care and Diabetes: {corr:.2f}")

# Visualization
fig, ax = plt.subplots()
sns.scatterplot(data=pivot_df, x='Dental Care', y='Diabetes', ax=ax)
ax.set_title(f'Dental Care vs Diabetes Rates by City (r = {corr:.2f})')
ax.set_xlabel('Dental Care (%)')
ax.set_ylabel('Diabetes Prevalence (%)')
ax.grid(True)
fig.tight_layout()
plt.show()


## 9. What cities have the highest rates of premature deaths from all causes?

In [None]:
# Filter for Premature Deaths metric
premature_deaths = df[df['metric_name'] == 'Premature Deaths (All Causes)']

# Sort and take the 15 highest rows. The result gets its own name so the
# obesity ranking stored in `top_cities` by the earlier cell is not
# overwritten (re-running the obesity chart would otherwise plot this data).
# NOTE(review): rows are ranked as they are; if a city has several rows for
# this metric it can occupy more than one of the 15 slots.
top_premature_cities = (
    premature_deaths
    .sort_values(by='est', ascending=False)
    .head(15)
    # "City, ST" labels keep same-named cities in different states apart;
    # with the bare name they would be merged into one averaged bar.
    .assign(city=lambda d: d['geo_name'] + ', ' + d['state_abbr'].fillna('n/a'))
)

# Visualization
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=top_premature_cities, x='est', y='city', hue='city',
            legend=False, palette='Reds_r', ax=ax)
ax.set_title('Top 15 Cities by Premature Death Rates')
ax.set_xlabel('Premature Death Rate (per 100,000)')
ax.set_ylabel('City')
fig.tight_layout()
plt.show()


## 10. Are there health disparities between cities in the same state?

In [None]:
# Pick one metric and look at how its estimates spread within each state;
# a tall box or long whiskers means cities in that state differ widely.
metric = "Obesity"
metric_df = df.loc[df["metric_name"] == metric]

# One box per state
fig, ax = plt.subplots(figsize=(14, 6))
sns.boxplot(data=metric_df, x="state_abbr", y="est", ax=ax)
ax.set_title(f"Distribution of {metric} Rates by State")
ax.set_ylabel("Estimated Rate (%)")
ax.set_xlabel("State")
ax.tick_params(axis="x", labelrotation=45)
fig.tight_layout()
plt.show()

In [6]:
# Number of rows contributed by each data source.
# NOTE(review): this cell's execution count (6) follows the load/preview
# cells, so the captured output appears to predate the analysis cells above;
# re-run the notebook top to bottom to refresh it.
df['source_name'].value_counts()

PLACES       6893
NVSS MCDD    5658
NVSS ND       734
NJSHAD        131
Name: source_name, dtype: int64