In [2]:
"""
PROJECT 4: STATE-LEVEL MATERNAL MORTALITY ANALYSIS (2018-2024)

Author: Marshawn Shelton, MPH, PMP, CCMP
Date: November 13, 2025
Part of: OE-3PI Framework Research Program

Research Questions:
1. Which states have worst maternal mortality?
2. Which states have biggest disparities?
3. Did 2023-2024 show improvement from 2021 peak?
4. Which states recovered fastest post-COVID?
5. Do policies correlate with outcomes?

This analysis will validate the OE-3PI Framework hypothesis:
States with operational infrastructure will show better outcomes
than states with policy coverage alone.
"""

# Standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# New visualization tools
import plotly.express as px
import plotly.graph_objects as go

# Import reusable functions from Project 2
import sys
sys.path.append('/Users/marshawnshelton/Documents/equity-metrics-dashboard')
from src.analysis import (
    load_maternal_data, 
    load_birth_data, 
    create_race_ethnicity,
    calculate_mmr,
    create_disparity_ratios,
    visualize_disparities
)

# Settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
plt.style.use('seaborn-v0_8-darkgrid')

print("‚úÖ All imports successful!")
print("‚úÖ Ready to download and analyze 2023-2024 data!")

‚úÖ All imports successful!
‚úÖ Ready to download and analyze 2023-2024 data!


In [4]:
# Load the state-level data (2018-2023)
print("=" * 80)
print("LOADING 2018-2023 STATE-LEVEL DATA")
print("This includes 2023 - our first look at post-policy outcomes!")
print("=" * 80)

# Use absolute paths to be sure
base_path = '/Users/marshawnshelton/Documents/equity-metrics-dashboard'

mortality_state = load_maternal_data(f'{base_path}/data/raw/maternal_mortality_state_2018_2023.txt')
births_state = load_birth_data(f'{base_path}/data/raw/births_state_2018_2023.txt')

print(f"\nüìä DATA SUMMARY:")
print(f"{'='*60}")
print(f"States in dataset: {mortality_state['State'].nunique()}")
print(f"Years available: {sorted(mortality_state['Year'].unique())}")
print(f"Total deaths 2018-2023: {mortality_state['Deaths'].sum():,.0f}")
print(f"Total births 2018-2023: {births_state['Births'].sum():,.0f}")

# Check for 2023 data specifically
deaths_2023 = mortality_state[mortality_state['Year'] == 2023]['Deaths'].sum()
births_2023 = births_state[births_state['Year'] == 2023]['Births'].sum()

print(f"\nüéØ 2023 SPECIFIC DATA (Post-Medicaid Extension):")
print(f"{'='*60}")
print(f"2023 Deaths: {deaths_2023:,.0f}")
print(f"2023 Births: {births_2023:,.0f}")

# Quick peek at the data structure
print(f"\nüìã Sample of mortality data:")
print(mortality_state.head(10))

LOADING 2018-2023 STATE-LEVEL DATA
This includes 2023 - our first look at post-policy outcomes!
‚úÖ Loaded 5,661 rows of maternal mortality data
‚úÖ Loaded 7,381 rows of birth data
   Total births: 22,077,525

üìä DATA SUMMARY:
States in dataset: 51
Years available: [np.float64(2018.0), np.float64(2019.0), np.float64(2020.0), np.float64(2021.0), np.float64(2022.0), np.float64(2023.0), np.float64(nan)]
Total deaths 2018-2023: 5,121
Total births 2018-2023: 22,077,525

üéØ 2023 SPECIFIC DATA (Post-Medicaid Extension):
2023 Deaths: 679
2023 Births: 3,595,470

üìã Sample of mortality data:
  Notes    State  State Code    Year  Year Code         Hispanic Origin  \
0   NaN  Alabama         1.0  2018.0     2018.0      Hispanic or Latino   
1   NaN  Alabama         1.0  2018.0     2018.0      Hispanic or Latino   
2   NaN  Alabama         1.0  2018.0     2018.0      Hispanic or Latino   
3   NaN  Alabama         1.0  2018.0     2018.0      Hispanic or Latino   
4   NaN  Alabama         1.0  

In [5]:
# Standardize race/ethnicity categories
print("Standardizing race/ethnicity categories...")

mortality_state['race_ethnicity'] = mortality_state.apply(create_race_ethnicity, axis=1)
births_state['race_ethnicity'] = births_state.apply(create_race_ethnicity, axis=1)

# Clean the data - remove any rows with missing Year
mortality_state = mortality_state[mortality_state['Year'].notna()].copy()
births_state = births_state[births_state['Year'].notna()].copy()

print("‚úÖ Race/ethnicity standardized")
print(f"‚úÖ Data cleaned")

# Verify race/ethnicity categories
print(f"\nüìã Race/Ethnicity Categories:")
print(mortality_state['race_ethnicity'].value_counts())

# Verify years
print(f"\nüìÖ Years in dataset:")
print(sorted(mortality_state['Year'].unique()))

Standardizing race/ethnicity categories...
‚úÖ Race/ethnicity standardized
‚úÖ Data cleaned

üìã Race/Ethnicity Categories:
race_ethnicity
Other/Unknown         3686
Hispanic or Latino    1901
Name: count, dtype: int64

üìÖ Years in dataset:
[np.float64(2018.0), np.float64(2019.0), np.float64(2020.0), np.float64(2021.0), np.float64(2022.0), np.float64(2023.0)]


In [7]:
# Check what the State column is actually called
print("Mortality data columns:")
print([col for col in mortality_state.columns if 'state' in col.lower()])

print("\nBirth data columns:")
print([col for col in births_state.columns if 'state' in col.lower()])

Mortality data columns:
['State', 'State Code']

Birth data columns:
['State of Residence', 'State of Residence Code']


In [8]:
# Rename births column to match mortality data
births_state = births_state.rename(columns={'State of Residence': 'State'})

print("‚úÖ Column names standardized!")
print(f"\nMortality columns: {[col for col in mortality_state.columns if 'State' in col]}")
print(f"Birth columns: {[col for col in births_state.columns if 'State' in col]}")

‚úÖ Column names standardized!

Mortality columns: ['State', 'State Code']
Birth columns: ['State', 'State of Residence Code']


In [9]:
# Calculate state-level MMRs (overall, all races combined)
print("=" * 80)
print("CALCULATING STATE-LEVEL MATERNAL MORTALITY RATES")
print("=" * 80)

state_mmr_overall = calculate_mmr(
    mortality_state,
    births_state,
    group_by=['State']
)

print(f"\nüèÜ TOP 10 BEST STATES (Lowest MMR 2018-2023):")
print("=" * 60)
print(state_mmr_overall.head(10)[['State', 'Deaths', 'Births', 'MMR']])

print(f"\n‚ö†Ô∏è  TOP 10 WORST STATES (Highest MMR 2018-2023):")
print("=" * 60)
print(state_mmr_overall.tail(10)[['State', 'Deaths', 'Births', 'MMR']])

CALCULATING STATE-LEVEL MATERNAL MORTALITY RATES
‚úÖ Calculated MMR for 51 groups
   Total deaths: 5,121
   Total births: 22,077,525

üèÜ TOP 10 BEST STATES (Lowest MMR 2018-2023):
            State  Deaths     Births        MMR
0         Alabama   169.0   348033.0  48.558614
1       Tennessee   229.0   486847.0  47.037365
2         Georgia   344.0   750288.0  45.849061
3  North Carolina   294.0   716438.0  41.036349
4       Louisiana   133.0   344713.0  38.582821
5           Texas   800.0  2275653.0  35.154745
6        Kentucky   110.0   315139.0  34.905232
7     Mississippi    72.0   213303.0  33.754800
8        Virginia   192.0   576086.0  33.328357
9            Ohio   247.0   783661.0  31.518731

‚ö†Ô∏è  TOP 10 WORST STATES (Highest MMR 2018-2023):
                   State  Deaths    Births  MMR
41                Nevada     0.0  203041.0  0.0
42              Nebraska     0.0  147556.0  0.0
43               Montana     0.0   66748.0  0.0
44                Alaska     0.0   57021.0  

In [10]:
# Let's look at the data more carefully
print("Checking for data quality issues...\n")

# How many states have zero deaths?
zero_death_states = state_mmr_overall[state_mmr_overall['Deaths'] == 0]
print(f"States with 0 deaths (likely suppressed data): {len(zero_death_states)}")
print(zero_death_states['State'].tolist())

# Show states with actual data
valid_states = state_mmr_overall[state_mmr_overall['Deaths'] > 0].copy()
print(f"\nStates with valid data: {len(valid_states)}")

# Re-sort to see actual best and worst
print(f"\nüèÜ TOP 10 BEST STATES (Lowest MMR - Valid Data Only):")
print("=" * 60)
print(valid_states.nsmallest(10, 'MMR')[['State', 'Deaths', 'Births', 'MMR']])

print(f"\n‚ö†Ô∏è  TOP 10 WORST STATES (Highest MMR - Valid Data Only):")
print("=" * 60)
print(valid_states.nlargest(10, 'MMR')[['State', 'Deaths', 'Births', 'MMR']])

Checking for data quality issues...

States with 0 deaths (likely suppressed data): 17
['South Dakota', 'Vermont', 'Rhode Island', 'Delaware', 'North Dakota', 'Connecticut', 'New Hampshire', 'Nevada', 'Nebraska', 'Montana', 'Alaska', 'District of Columbia', 'Maine', 'Hawaii', 'Idaho', 'Kansas', 'Wyoming']

States with valid data: 34

üèÜ TOP 10 BEST STATES (Lowest MMR - Valid Data Only):
            State  Deaths    Births        MMR
33      Minnesota    13.0  386921.0   3.359859
32       Colorado    13.0  374055.0   3.475425
31           Utah    10.0  277178.0   3.607790
30       Oklahoma    12.0  291119.0   4.122026
29  Massachusetts    21.0  409426.0   5.129132
28           Iowa    14.0  220914.0   6.337308
27     New Mexico    10.0  131766.0   7.589211
26  West Virginia    10.0  104311.0   9.586717
25      Wisconsin    36.0  369452.0   9.744162
24     Washington    53.0  502242.0  10.552682

‚ö†Ô∏è  TOP 10 WORST STATES (Highest MMR - Valid Data Only):
            State  Deaths    

In [11]:
# Compare 2021 (peak) vs 2023 (post-policy)
print("=" * 80)
print("COMPARING COVID PEAK (2021) vs POST-POLICY (2023)")
print("=" * 80)

# Calculate MMR by year
mmr_by_year = calculate_mmr(
    mortality_state,
    births_state,
    group_by=['Year']
)

print("\nüìä National MMR by Year:")
print(mmr_by_year[['Year', 'Deaths', 'Births', 'MMR']])

# Calculate percent change 2021 ‚Üí 2023
mmr_2021 = mmr_by_year[mmr_by_year['Year'] == 2021]['MMR'].values[0]
mmr_2023 = mmr_by_year[mmr_by_year['Year'] == 2023]['MMR'].values[0]
pct_change = ((mmr_2023 - mmr_2021) / mmr_2021) * 100

print(f"\nüéØ KEY FINDING:")
print(f"2021 (Peak): {mmr_2021:.1f} per 100K")
print(f"2023 (Post-Policy): {mmr_2023:.1f} per 100K")
print(f"Change: {pct_change:+.1f}%")

if pct_change < 0:
    print(f"‚úÖ IMPROVEMENT of {abs(pct_change):.1f}%")
else:
    print(f"‚ö†Ô∏è  INCREASE of {pct_change:.1f}%")

COMPARING COVID PEAK (2021) vs POST-POLICY (2023)
‚úÖ Calculated MMR for 6 groups
   Total deaths: 5,121
   Total births: 22,077,525

üìä National MMR by Year:
     Year  Deaths     Births        MMR
0  2021.0  1260.0  3663729.0  34.391190
1  2020.0   917.0  3613047.0  25.380240
2  2022.0   886.0  3667193.0  24.160168
3  2019.0   730.0  3746972.0  19.482398
4  2023.0   679.0  3595470.0  18.884875
5  2018.0   649.0  3791114.0  17.118979

üéØ KEY FINDING:
2021 (Peak): 34.4 per 100K
2023 (Post-Policy): 18.9 per 100K
Change: -45.1%
‚úÖ IMPROVEMENT of 45.1%


In [12]:
# Calculate state MMR by year to see who recovered best
print("=" * 80)
print("STATE RECOVERY ANALYSIS: 2021 ‚Üí 2023")
print("=" * 80)

# Calculate MMR by state and year
state_year_mmr = calculate_mmr(
    mortality_state,
    births_state,
    group_by=['State', 'Year']
)

# Get 2021 and 2023 data for each state
mmr_2021 = state_year_mmr[state_year_mmr['Year'] == 2021][['State', 'MMR']].rename(columns={'MMR': 'MMR_2021'})
mmr_2023 = state_year_mmr[state_year_mmr['Year'] == 2023][['State', 'MMR']].rename(columns={'MMR': 'MMR_2023'})

# Merge
state_change = mmr_2021.merge(mmr_2023, on='State', how='inner')

# Calculate change
state_change['Change'] = state_change['MMR_2023'] - state_change['MMR_2021']
state_change['Pct_Change'] = (state_change['Change'] / state_change['MMR_2021']) * 100

# Remove states with zero (suppressed data)
state_change = state_change[(state_change['MMR_2021'] > 0) & (state_change['MMR_2023'] > 0)]

# Sort by improvement
state_change_sorted = state_change.sort_values('Pct_Change')

print(f"\nüèÜ TOP 10 STATES WITH BIGGEST IMPROVEMENT (2021 ‚Üí 2023):")
print("=" * 70)
print(state_change_sorted.head(10).to_string(index=False))

print(f"\n‚ö†Ô∏è  TOP 10 STATES WITH WORST PERFORMANCE (2021 ‚Üí 2023):")
print("=" * 70)
print(state_change_sorted.tail(10).to_string(index=False))

print(f"\nüìä National average improvement: -45.1%")

STATE RECOVERY ANALYSIS: 2021 ‚Üí 2023
‚úÖ Calculated MMR for 306 groups
   Total deaths: 5,121
   Total births: 22,077,525

üèÜ TOP 10 STATES WITH BIGGEST IMPROVEMENT (2021 ‚Üí 2023):
      State  MMR_2021  MMR_2023     Change  Pct_Change
    Alabama 77.524722 24.200100 -53.324622  -68.784022
 New Jersey 39.411973 13.861386 -25.550587  -64.829505
  Louisiana 71.382558 30.955242 -40.427316  -56.634725
       Ohio 47.002258 22.854441 -24.147817  -51.375866
   Virginia 53.224796 25.905338 -27.319459  -51.328442
   Michigan 28.577688 15.133172 -13.444517  -47.045500
   Arkansas 52.846772 28.369599 -24.477173  -46.317253
Mississippi 62.603153 34.833091 -27.770061  -44.358886
      Texas 54.339097 30.416761 -23.922336  -44.024169
    Florida 44.854039 26.197165 -18.656874  -41.594635

‚ö†Ô∏è  TOP 10 STATES WITH WORST PERFORMANCE (2021 ‚Üí 2023):
         State  MMR_2021  MMR_2023     Change  Pct_Change
    California 21.397596 12.996491  -8.401105  -39.261910
     Tennessee 68.539257 42.16

In [13]:
# Save the state-level analysis
print("Saving state-level data for Project 5 analysis...")

# Save overall state MMRs
state_mmr_overall.to_csv(f'{base_path}/data/processed/state_mmr_2018_2023.csv', index=False)

# Save state-year MMRs
state_year_mmr.to_csv(f'{base_path}/data/processed/state_year_mmr_2018_2023.csv', index=False)

# Save state recovery analysis
state_change_sorted.to_csv(f'{base_path}/data/processed/state_recovery_2021_2023.csv', index=False)

print("‚úÖ Data saved to data/processed/")
print("\nFiles created:")
print("  - state_mmr_2018_2023.csv (overall state rankings)")
print("  - state_year_mmr_2018_2023.csv (state trends over time)")
print("  - state_recovery_2021_2023.csv (2021‚Üí2023 changes)")

Saving state-level data for Project 5 analysis...
‚úÖ Data saved to data/processed/

Files created:
  - state_mmr_2018_2023.csv (overall state rankings)
  - state_year_mmr_2018_2023.csv (state trends over time)
  - state_recovery_2021_2023.csv (2021‚Üí2023 changes)
