In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import os

In [2]:
# Define the folder path
folder_path = r"C:\Users\liuc\Desktop\talent rentention\Healthdata2"

# Get all Excel files
excel_files = sorted([f for f in os.listdir(folder_path) if f.endswith(('.xlsx', '.xls'))])

# Column name mapping dictionary to standardize names
column_mapping = {
    # FIPS, State, County (consistent)
    'FIPS': 'FIPS',
    'State': 'State',
    'County': 'County',
    
    # Deaths/Premature Deaths variations
    'pre mature Deaths': 'Premature Deaths',
    'premature Deaths': 'Premature Deaths',
    'Premature death': 'Premature Deaths',
    '# Deaths': 'Premature Deaths',
    'Deaths': 'Premature Deaths',
    
    # Fair/Poor Health variations
    '% Fair or Poor Health': '% Fair or Poor Health',
    '% Fair/Poor': '% Fair or Poor Health',
    
    # Physically Unhealthy Days
    'Physically Unhealthy Days': 'Physically Unhealthy Days',
    'Average Number of Physically Unhealthy Days': 'Physically Unhealthy Days',
    
    # Mentally Unhealthy Days
    'Mentally Unhealthy Days': 'Mentally Unhealthy Days',
    'Average Number of Mentally Unhealthy Days': 'Mentally Unhealthy Days',
    
    # Low Birth Weight
    '% low birth weight': '% Low Birthweight',
    '% Low birthweight': '% Low Birthweight',
    '% Low Birthweight': '% Low Birthweight',
    '% LBW': '% Low Birthweight',
    
    # Smoking
    '% Smokers': '% Smokers',
    '% Adults Reporting Currently Smoking': '% Smokers',
    
    # Obesity
    '% Obese': '% Adults with Obesity',
    '% Adults with Obesity': '% Adults with Obesity',
    
    # Food Environment Index (consistent)
    'Food Environment Index': 'Food Environment Index',
    
    # Physical Activity/Exercise Access
    '% With Access': '% With Access to Exercise Opportunities',
    '% With Access exercise': '% With Access to Exercise Opportunities',
    '% With Access to Exercise Opportunities': '% With Access to Exercise Opportunities',
    '% Physically Inactive': '% Physically Inactive',
    
    # Excessive Drinking (consistent)
    '% Excessive Drinking': '% Excessive Drinking',
    
    # Driving Deaths
    '# Alcohol-Impaired Driving Deaths': '# Alcohol-Impaired Driving Deaths',
    '# Driving Deaths': '# Driving Deaths',
    
    # Teen Birth Rate (consistent)
    'Teen Birth Rate': 'Teen Birth Rate',
    
    # Uninsured
    '# Uninsured': '# Uninsured',
    '% Uninsured': '% Uninsured',
    
    # Primary Care Physicians
    'PCP Rate': 'Primary Care Physicians Rate',
    'Primary Care Physicians Ratio': 'Primary Care Physicians Rate',
    'Primary Care Physicians Rate': 'Primary Care Physicians Rate',
    
    # Medicare
    '# Medicare enrollees': '# Medicare Enrollees',
    '# Medicare Enrollees': '# Medicare Enrollees',
    
    # Preventable Hospitalizations
    'Preventable Hosp. Rate': 'Preventable Hospitalization Rate',
    'Preventable Hospitalization Rate': 'Preventable Hospitalization Rate',
    
    # Education
    '% Some College': '% Some College',
    
    # Unemployment
    '% Unemployed': '% Unemployed',
    
    # Child Poverty
    '% Children in Poverty': '% Children in Poverty',
    
    # Income Ratio
    'Income Ratio': 'Income Ratio',
    
    # Single-Parent Households
    '% Single-Parent Households': '% Children in Single-Parent Households',
    '% Children in Single-Parent Households': '% Children in Single-Parent Households',
    
    # Social Association
    'Association Rate': 'Social Association Rate',
    'Social Association Rate': 'Social Association Rate',
    
    # Crime
    'Violent Crime Rate': 'Violent Crime Rate',
    
    # Housing
    '% Severe Housing Problems': '% Severe Housing Problems',
    
    # Commuting
    '% Drive Alone': '% Drive Alone to Work',
    '% Drive Alone to Work': '% Drive Alone to Work',
    'Long Commute - Drives Alone': '% Long Commute - Drives Alone',
    '% Long Commute - Drives Alone': '% Long Commute - Drives Alone',
    
    # Dentist columns (2023 specific)
    'Quartile': 'Quartile',
    '# Dentists': '# Dentists',
    'Dentist Rate': 'Dentist Rate',
    'Dentist Ratio': 'Dentist Ratio'
}

# List to store all dataframes
all_dfs = []

# Process each file
for file in excel_files:
    file_path = os.path.join(folder_path, file)
    
    try:
        # Extract year from filename
        if '2014' in file:
            year = 2014
        elif '2015' in file:
            year = 2015
        elif '2016' in file:
            year = 2016
        elif '2017' in file:
            year = 2017
        elif '2018' in file:
            year = 2018
        elif '2019' in file:
            year = 2019
        elif '2020' in file:
            year = 2020
        elif '2021' in file:
            year = 2021
        elif '2022' in file:
            year = 2022
        elif '2023' in file:
            year = 2023
        elif '2024' in file:
            year = 2024
        else:
            year = 'Unknown'
        
        # Read the Excel file
        df = pd.read_excel(file_path, sheet_name='Ranked Measure Data')
        
        # Add year column
        df['Year'] = year
        
        # Rename columns using the mapping
        df = df.rename(columns=column_mapping)
        
        # Add to list
        all_dfs.append(df)
        
        print(f"Processed {file}: {len(df)} rows, Year: {year}")
        
    except Exception as e:
        print(f"Error processing {file}: {str(e)}")

# Get all unique columns across all dataframes
all_columns = set()
for df in all_dfs:
    all_columns.update(df.columns)

# Remove 'Year' from all_columns as we'll add it at a specific position
all_columns.discard('Year')

# Define the order of columns (put most important ones first)
column_order = ['Year', 'FIPS', 'State', 'County'] + sorted(list(all_columns - {'FIPS', 'State', 'County'}))

# Standardize all dataframes to have the same columns
standardized_dfs = []
for df in all_dfs:
    # Add missing columns with NaN
    for col in column_order:
        if col not in df.columns:
            df[col] = np.nan
    
    # Reorder columns
    df = df[column_order]
    standardized_dfs.append(df)

# Combine all dataframes
combined_df = pd.concat(standardized_dfs, ignore_index=True)

# Sort by Year, State, and County
combined_df = combined_df.sort_values(['Year', 'State', 'County'])


Processed 2014 County Health Rankings Data - v6.xls: 3141 rows, Year: 2014
Processed 2015 County Health Rankings Data - v3.xls: 3141 rows, Year: 2015
Processed 2016 County Health Rankings Data - v3.xls: 3141 rows, Year: 2016
Processed 2017CountyHealthRankingsData.xls: 3136 rows, Year: 2017
Processed 2018 County Health Rankings Data - v2.xls: 3142 rows, Year: 2018
Processed 2019 County Health Rankings Data - v3.xls: 3142 rows, Year: 2019
Processed 2020 County Health Rankings Data - v2.xlsx: 3193 rows, Year: 2020
Processed 2021 County Health Rankings Data - v1.xlsx: 3193 rows, Year: 2021
Processed 2022 County Health Rankings Data - v1.xlsx: 3193 rows, Year: 2022
Processed 2023 County Health Rankings Data - v2.xlsx: 3193 rows, Year: 2023
Processed 2024_county_health_release_data_-_v1 (1).xlsx: 3201 rows, Year: 2024


In [3]:
# Remove rows where County is null
print(f"Original shape: {combined_df.shape}")
print(f"Rows with null County: {combined_df['County'].isna().sum()}")

# Remove rows where County is null
combined_df_cleaned = combined_df[combined_df['County'].notna()].copy()

print(f"After removing null counties: {combined_df_cleaned.shape}")

# Calculate missing percentage for each column
missing_percent = (combined_df_cleaned.isna().sum() / len(combined_df_cleaned)) * 100

# Create a summary of missing data
missing_summary = pd.DataFrame({
    'Column': missing_percent.index,
    'Missing_Count': combined_df_cleaned.isna().sum().values,
    'Total_Rows': len(combined_df_cleaned),
    'Missing_Percentage': missing_percent.values
}).sort_values('Missing_Percentage', ascending=False)

print(f"\n{'='*60}")
print("Columns with missing data:")
print(f"{'='*60}")
print(missing_summary[missing_summary['Missing_Percentage'] > 0])

# Identify columns to keep (less than or equal to 50% missing)
columns_to_keep = missing_percent[missing_percent <= 50].index.tolist()
columns_to_remove = missing_percent[missing_percent > 50].index.tolist()

print(f"\n{'='*60}")
print(f"Columns to remove (>50% missing): {len(columns_to_remove)}")
print(f"{'='*60}")
for col in columns_to_remove:
    print(f"  - {col}: {missing_percent[col]:.1f}% missing")

# Keep only columns with <=50% missing data
combined_df_cleaned = combined_df_cleaned[columns_to_keep]

print(f"\n{'='*60}")
print("Final cleaned dataset summary:")
print(f"{'='*60}")
print(f"Shape: {combined_df_cleaned.shape}")
print(f"Rows: {len(combined_df_cleaned)}")
print(f"Columns: {len(combined_df_cleaned.columns)}")
print(f"\nRemaining columns ({len(combined_df_cleaned.columns)}):")
for i, col in enumerate(combined_df_cleaned.columns, 1):
    non_missing = combined_df_cleaned[col].notna().sum()
    pct_available = (non_missing / len(combined_df_cleaned)) * 100
    print(f"  {i:2d}. {col:<50} ({pct_available:.1f}% data available)")


Original shape: (34816, 38)
Rows with null County: 255
After removing null counties: (34561, 38)

Columns with missing data:
                                     Column  Missing_Count  Total_Rows  \
25                             Dentist Rate          31505       34561   
5                                # Dentists          31505       34561   
26                            Dentist Ratio          31505       34561   
23                  % With Access exercise           31498       34561   
34                                 Quartile          31479       34561   
17                    % Physically Inactive          31419       34561   
8                               # Uninsured          25139       34561   
7                      # Medicare Enrollees          19499       34561   
31                         Premature Deaths           8309       34561   
37                       Violent Crime Rate           7921       34561   
32         Preventable Hospitalization Rate           3913   

In [4]:
# Read the QWI data
qwi_path = r"C:\Users\liuc\Downloads\qwi_47eb6eafd6f449ccbab042fc81879bc1.csv"
qwi_df = pd.read_csv(qwi_path)

print("QWI Data Shape:", qwi_df.shape)
print(f"\nUnique years: {sorted(qwi_df['year'].unique())}")
print(f"Number of unique counties (FIPS): {qwi_df['geography'].nunique()}")
print(f"Number of unique industries: {qwi_df['industry'].nunique()}")

# Handle FIPS codes
qwi_df['FIPS'] = qwi_df['geography'].astype(str).str.zfill(5)

# Handle suppressed data
value_cols = ['EarnBeg', 'Emp', 'HirA']
suppression_cols = ['sEarnBeg', 'sEmp', 'sHirA']

for val_col, supp_col in zip(value_cols, suppression_cols):
    qwi_df.loc[qwi_df[supp_col] == 5, val_col] = np.nan
    print(f"Suppressed {val_col}: {(qwi_df[supp_col] == 5).sum()} records")

# Create quarterly version of health data
quarters = [1, 2, 3, 4]
health_quarterly_list = []

for quarter in quarters:
    health_q = combined_df_cleaned.copy()
    health_q['quarter'] = quarter
    health_quarterly_list.append(health_q)

health_quarterly = pd.concat(health_quarterly_list, ignore_index=True)

print(f"\nOriginal health data shape: {combined_df_cleaned.shape}")
print(f"Quarterly health data shape: {health_quarterly.shape}")

# Ensure FIPS formatting in health data
health_quarterly['FIPS'] = health_quarterly['FIPS'].astype(str).str.zfill(5)

# Merge - note the lowercase 'year' in QWI data
merged_df = pd.merge(
    health_quarterly,
    qwi_df,
    left_on=['FIPS', 'Year', 'quarter'],
    right_on=['FIPS', 'year', 'quarter'],
    how='inner',
    suffixes=('_health', '_qwi')
)

print(f"\nMerged data shape: {merged_df.shape}")
print(f"Number of columns: {len(merged_df.columns)}")

# Check merge quality
print("\n=== MERGE QUALITY CHECK ===")
print(f"Unique FIPS in health data: {health_quarterly['FIPS'].nunique()}")
print(f"Unique FIPS in QWI data: {qwi_df['FIPS'].nunique()}")
print(f"Unique FIPS in merged data: {merged_df['FIPS'].nunique()}")

# Check industries in merged data
print(f"\nRecords per industry in merged data:")
print(merged_df['industry'].value_counts().head())

# Check year coverage
print(f"\nYear coverage in merged data:")
print(merged_df['Year'].value_counts().sort_index())

# Basic statistics on workforce metrics
print("\n=== WORKFORCE METRICS SUMMARY ===")
workforce_metrics = merged_df.groupby('industry')[['Emp', 'EarnBeg', 'HirA']].agg(['mean', 'median', 'count'])
print(workforce_metrics.head())

# Check how much data we retained
retention_rate = (merged_df['FIPS'].nunique() / health_quarterly['FIPS'].nunique()) * 100
print(f"\nCounty retention rate: {retention_rate:.1f}%")

print(f"\nFinal dataset: {len(merged_df):,} records")
print(f"Unique county-year-quarter-industry combinations")

QWI Data Shape: (2642015, 24)

Unique years: [2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
Number of unique counties (FIPS): 3222
Number of unique industries: 21
Suppressed EarnBeg: 11305 records
Suppressed Emp: 259199 records
Suppressed HirA: 287103 records

Original health data shape: (34561, 30)
Quarterly health data shape: (138244, 31)

Merged data shape: (2573811, 54)
Number of columns: 54

=== MERGE QUALITY CHECK ===
Unique FIPS in health data: 3150
Unique FIPS in QWI data: 3222
Unique FIPS in merged data: 3127

Records per industry in merged data:
industry
00       135584
44-45    135348
23       134964
62       134904
72       134797
Name: count, dtype: int64

Year coverage in merged data:
Year
2014    236893
2015    236832
2016    236126
2017    235371
2018    235487
2019    235625
2020    235622
2021    235941
2022    230093
2023    230257
2024    225564
Name: count, dtype: int64

=== WORKFORCE METRICS SUMMARY ===
                   Emp                   

In [5]:
health_columns = [col for col in combined_df_cleaned.columns if col != 'quarter']

# From QWI data: only the specific columns you mentioned
qwi_columns = ['quarter', 'EarnBeg', 'Emp', 'HirA', 'sEarnBeg', 'sEmp', 'sHirA', 'industry']

# Combine the column lists
columns_to_keep = health_columns + qwi_columns

# Filter the merged dataframe
merged_df_filtered = merged_df[columns_to_keep].copy()

In [6]:
# Define mapping
industry_mapping = {
    '00': 'Total, All Industries',
    '11': 'Agriculture, Forestry, Fishing, and Hunting',
    '21': 'Mining, Quarrying, and Oil and Gas Extraction',
    '22': 'Utilities',
    '23': 'Construction',
    '31-33': 'Manufacturing',
    '42': 'Wholesale Trade',
    '44-45': 'Retail Trade',
    '48-49': 'Transportation and Warehousing',
    '51': 'Information',
    '52': 'Finance and Insurance',
    '53': 'Real Estate and Rental and Leasing',
    '54': 'Professional, Scientific, and Technical Services',
    '55': 'Management of Companies and Enterprises',
    '56': 'Administrative and Support and Waste Management Services',
    '61': 'Educational Services',
    '62': 'Health Care and Social Assistance',
    '71': 'Arts, Entertainment, and Recreation',
    '72': 'Accommodation and Food Services',
    '81': 'Other Services (except Public Administration)',
    '92': 'Public Administration'
}

# Add readable industry name
merged_df_filtered['industry_name'] = merged_df_filtered['industry'].map(industry_mapping)

merged_df_filtered = merged_df_filtered.rename(columns={
    'EarnBeg': 'Earnings_Beginning_Qtr',
    'Emp': 'Employment_Count',
    'HirA': 'New_Hires',
    'sEarnBeg': 'Earnings_Suppression_Flag',
    'sEmp': 'Employment_Suppression_Flag',
    'sHirA': 'Hires_Suppression_Flag'
})


In [7]:
population_path = r"C:\Users\liuc\Downloads\Population by Age and Sex - US, States, Counties.csv"
df_population = pd.read_csv(population_path)


In [8]:
# Ensure Year is int
merged_df_filtered['Year'] = merged_df_filtered['Year'].astype(int)
df_population_unique = (
    df_population
    .sort_values(['Statefips','Countyfips','Year'])
    .drop_duplicates(subset=['Statefips','Countyfips','Year'], keep='first')
)


df_population_unique['FIPS'] = (
    df_population_unique['Statefips'].astype(str).str.zfill(2) +
    df_population_unique['Countyfips'].astype(str).str.zfill(3)
)

df_pop_small = df_population_unique[['FIPS','Year','Total Population']].rename(
    columns={'Total Population':'Population'}
)

merged_full = merged_df_filtered.merge(
    df_pop_small, on=['FIPS','Year'], how='left'
)



In [9]:
import numpy as np
import pandas as pd

# ---------------------------------------------------------
# CONFIG
# ---------------------------------------------------------
health_count_cols = [
    '# Alcohol-Impaired Driving Deaths',
    '# Driving Deaths',
    'Premature Deaths'
]

health_rate_cols = [
    '% Adults with Obesity',
    '% Children in Poverty',
    '% Children in Single-Parent Households',
    '% Drive Alone to Work',
    '% Excessive Drinking',
    '% Fair or Poor Health',
    '% Long Commute - Drives Alone',
    '% Low Birthweight',
    '% Severe Housing Problems',
    '% Smokers',
    '% Some College',
    '% Unemployed',
    '% Uninsured',
    '% With Access to Exercise Opportunities',
    'Food Environment Index',
    'Income Ratio',
    'Mentally Unhealthy Days',
    'Physically Unhealthy Days',
    'Preventable Hospitalization Rate',
    'Primary Care Physicians Rate',
    'Social Association Rate',
    'Teen Birth Rate',
    'Violent Crime Rate'
]

econ_cols = ['Employment_Count', 'New_Hires', 'Earnings_Beginning_Qtr']
flag_map = {
    'Employment_Count': 'Employment_Suppression_Flag',
    'New_Hires': 'Hires_Suppression_Flag',
    'Earnings_Beginning_Qtr': 'Earnings_Suppression_Flag'
}

# Common "total industry" signals (adjust if your data differs)
TOTAL_CODES = {'00', '0', '000', '10'}
TOTAL_NAME_PAT = r'\b(total|all)\b'

# ---------------------------------------------------------
# Helpers
# ---------------------------------------------------------
def to_num_clean(s):
    # robust numeric conversion: removes commas and common suppression tokens
    s = s.astype(str).str.replace(",", "", regex=False).str.strip()
    s = s.replace({
        "": np.nan, "nan": np.nan, "None": np.nan, "NA": np.nan, "N/A": np.nan,
        "*": np.nan, "S": np.nan, "s": np.nan
    })
    return pd.to_numeric(s, errors="coerce")

def sum_mc1(x):
    return x.sum(min_count=1)

def safe_div(n, d):
    n = np.asarray(n, dtype=float)
    d = np.asarray(d, dtype=float)
    return np.where((d == 0) | np.isnan(d), np.nan, n / d)

def weighted_mean(x, w):
    x = np.asarray(x, dtype=float)
    w = np.asarray(w, dtype=float)
    num = np.nansum(x * w)
    den = np.nansum(w)
    return np.nan if den == 0 else num / den

def infer_suppressed_value(df, val_col, flag_col):
    """
    Tries to infer which flag value indicates suppression by checking which flag value
    is most associated with missing/low coverage in val_col.
    Returns a set of values to treat as 'suppressed'. If ambiguous, returns empty set.
    """
    if flag_col not in df.columns or val_col not in df.columns:
        return set()

    f = df[flag_col].copy()
    # normalize flag to strings so 1/"1"/True collapse
    f = f.astype(str).str.strip().replace({"nan": np.nan})
    tmp = pd.DataFrame({"flag": f, "val": df[val_col]})

    # only keep flags that appear enough
    vc = tmp["flag"].value_counts(dropna=True)
    cand = vc[vc >= 100].index.tolist()
    if not cand:
        return set()

    stats = []
    for v in cand:
        sub = tmp[tmp["flag"] == v]
        miss = sub["val"].isna().mean()
        nonmiss = 1 - miss
        stats.append((v, nonmiss, miss, len(sub)))

    # pick the flag value with the *lowest* non-missing rate as "suppressed"
    stats_sorted = sorted(stats, key=lambda t: t[1])
    best = stats_sorted[0]
    second = stats_sorted[1] if len(stats_sorted) > 1 else None

    # require separation to avoid guessing wrong
    # (e.g., suppressed has 5% non-missing and next has 95%)
    if second is None or (second[1] - best[1] >= 0.20):
        return {best[0]}
    return set()

# ---------------------------------------------------------
# STEP 0: Pre-processing
# ---------------------------------------------------------
df = merged_full.copy()

# Keep FIPS as string with leading zeros (do NOT numeric-coerce it)
df["FIPS"] = df["FIPS"].astype(str).str.extract(r"(\d+)", expand=False)
df["FIPS"] = df["FIPS"].str.zfill(5)

# Industry as string
df["industry"] = df["industry"].astype(str).str.strip()
df["industry_name"] = df["industry_name"].astype(str)

# Numeric coercion for relevant cols
for c in ["Year", "quarter", "Population"] + econ_cols + health_count_cols + health_rate_cols:
    if c in df.columns:
        df[c] = to_num_clean(df[c])

# Filter year window
df = df[(df["Year"] >= 2014) & (df["Year"] <= 2024)].copy()

# Quarter normalization 0..3 -> 1..4
if "quarter" in df.columns:
    qmin, qmax = df["quarter"].min(), df["quarter"].max()
    if pd.notna(qmin) and pd.notna(qmax) and qmin >= 0 and qmax <= 3:
        df["quarter"] = df["quarter"] + 1

# ---------------------------------------------------------
# STEP 1: HEALTH (County-Year -> State-Year), de-dup first
# ---------------------------------------------------------
health_keep = ["State", "Year", "FIPS", "Population"] + health_count_cols + health_rate_cols
county_health_raw = df[health_keep].copy()

county_health = (
    county_health_raw
    .sort_values(["State", "FIPS", "Year"])
    .groupby(["State", "Year", "FIPS"], as_index=False)
    .agg({**{"Population": "first"},
          **{c: "first" for c in health_count_cols},
          **{c: "first" for c in health_rate_cols}})
)

# Create numerators for weighted averages
for col in health_rate_cols:
    county_health[f"{col}_num"] = county_health[col] * county_health["Population"]

# Aggregate to state-year
agg_dict = {"state_total_pop": ("Population", "sum")}
for col in health_count_cols:
    agg_dict[f"STATE_{col}"] = (col, sum_mc1)
for col in health_rate_cols:
    agg_dict[f"{col}_num_sum"] = (f"{col}_num", sum_mc1)

state_health = county_health.groupby(["State", "Year"], as_index=False).agg(**agg_dict)

# finalize weighted rates
for col in health_rate_cols:
    state_health[f"STATE_{col}"] = state_health[f"{col}_num_sum"] / state_health["state_total_pop"].replace({0: np.nan})
    state_health.drop(columns=[f"{col}_num_sum"], inplace=True)

# optional: convert health counts to per-100k at state level
for col in health_count_cols:
    state_health[f"STATE_{col}_per_100k"] = (
        100000 * state_health[f"STATE_{col}"] / state_health["state_total_pop"].replace({0: np.nan})
    )

# ---------------------------------------------------------
# STEP 2: ECON (County-Quarter-Industry -> State-Year)
# Key: avoid double counting totals vs components + handle suppression flags safely
# ---------------------------------------------------------

df_econ = df[["State", "Year", "quarter", "FIPS", "industry", "industry_name", "Population"] + econ_cols +
            [v for v in flag_map.values() if v in df.columns]].copy()

# --- 2A) Infer which flag value means "suppressed" (so we don't accidentally wipe all data)
suppressed_values = {}
for col, flag in flag_map.items():
    if flag in df_econ.columns:
        suppressed_values[col] = infer_suppressed_value(df_econ, col, flag)
    else:
        suppressed_values[col] = set()

# Apply suppression -> NaN ONLY if we inferred a value confidently
for col, flag in flag_map.items():
    vals = suppressed_values.get(col, set())
    if flag in df_econ.columns and vals:
        f = df_econ[flag].astype(str).str.strip()
        df_econ.loc[f.isin(vals), col] = np.nan

# --- 2B) Decide how to handle industry totals
# Strategy:
#   If "total" rows exist and have decent coverage -> use total only (no double count)
#   Else -> sum across non-total industries, excluding totals if present

is_total = (
    df_econ["industry"].isin(TOTAL_CODES)
    | df_econ["industry_name"].str.lower().str.contains(TOTAL_NAME_PAT, regex=True, na=False)
)

econ_total = df_econ[is_total].copy()
econ_parts = df_econ[~is_total].copy()

# Coverage check: how many State-Year-Quarter groups have any non-missing employment in total rows?
tot_grp = econ_total.groupby(["State", "Year", "quarter"])["Employment_Count"].apply(lambda s: s.notna().any())
total_coverage = tot_grp.mean() if len(tot_grp) else 0.0

MODE = "total_only" if (len(econ_total) > 0 and total_coverage >= 0.60) else "sum_parts"

base_econ = econ_total if MODE == "total_only" else econ_parts

# Wage bill (use employment-weighted earnings; if Employment_Count missing, wage_bill missing)
base_econ["wage_bill"] = base_econ["Earnings_Beginning_Qtr"] * base_econ["Employment_Count"]

# --- 2C) State-quarter totals (sum across counties and (if parts) industries)
state_qtr = (
    base_econ.groupby(["State", "Year", "quarter"], as_index=False)
    .agg(
        state_emp_qtr=("Employment_Count", sum_mc1),
        state_hires_qtr=("New_Hires", sum_mc1),
        state_wage_bill_qtr=("wage_bill", sum_mc1),
    )
)

state_qtr["state_avg_earnings_qtr"] = safe_div(state_qtr["state_wage_bill_qtr"], state_qtr["state_emp_qtr"])

# --- 2D) State-year aggregation
state_annual = (
    state_qtr.groupby(["State", "Year"], as_index=False)
    .agg(
        state_emp_avg=("state_emp_qtr", "mean"),   # stock -> average across quarters
        state_emp_sum=("state_emp_qtr", sum_mc1),  # person-quarters
        state_hires_total=("state_hires_qtr", sum_mc1),  # flow -> sum across quarters
        state_wage_bill_total=("state_wage_bill_qtr", sum_mc1),
        state_avg_earnings_meanq=("state_avg_earnings_qtr", "mean"),
    )
)

# Employment-weighted earnings across quarters (recommended)
state_annual["state_avg_earnings"] = safe_div(state_annual["state_wage_bill_total"], state_annual["state_emp_sum"])

# ---------------------------------------------------------
# STEP 3: Merge Econ + Health (annual)
# ---------------------------------------------------------
state_df = state_annual.merge(state_health, on=["State", "Year"], how="left")

# ---------------------------------------------------------
# STEP 4: Core annual rates + YoY (your existing features)
# ---------------------------------------------------------
pop = state_df["state_total_pop"].replace({0: np.nan})
emp_avg = state_df["state_emp_avg"].replace({0: np.nan})

state_df["econ_emp_per_1k"] = 1000 * safe_div(state_df["state_emp_avg"], pop)
state_df["econ_hires_per_1k"] = 1000 * safe_div(state_df["state_hires_total"], pop)
state_df["econ_hire_rate_annual"] = safe_div(state_df["state_hires_total"], emp_avg)

state_df = state_df.sort_values(["State", "Year"]).copy()

def yoy_growth(s):
    prev = s.shift(1)
    return safe_div(s - prev, prev)

state_df["growth_emp_yoy"] = state_df.groupby("State")["state_emp_avg"].transform(yoy_growth)
state_df["growth_earn_yoy"] = state_df.groupby("State")["state_avg_earnings"].transform(yoy_growth)
state_df["growth_hires_yoy"] = state_df.groupby("State")["state_hires_total"].transform(yoy_growth)

# ---------------------------------------------------------
# STEP 5: NEW TOPIC: Relativity + Divergence from US trends
# ---------------------------------------------------------
# Level rates
state_df["emp_rate"]   = safe_div(state_df["state_emp_avg"], pop)
state_df["hires_rate"] = safe_div(state_df["state_hires_total"], pop)
state_df["earn_level"] = state_df["state_avg_earnings"]  # rename for clarity

# US aggregates by year (consistent definitions)
us = (
    state_df.groupby("Year", as_index=False)
    .apply(lambda s: pd.Series({
        "us_pop": np.nansum(s["state_total_pop"]),
        "us_emp_avg": np.nansum(s["state_emp_avg"]),
        "us_hires_total": np.nansum(s["state_hires_total"]),
        "us_earn_level": weighted_mean(s["earn_level"].to_numpy(), s["state_emp_sum"].to_numpy())
    }))
    .reset_index(drop=True)
)

us["us_emp_rate"] = safe_div(us["us_emp_avg"], us["us_pop"])
us["us_hires_rate"] = safe_div(us["us_hires_total"], us["us_pop"])

state_df = state_df.merge(us[["Year", "us_emp_rate", "us_hires_rate", "us_earn_level"]], on="Year", how="left")

# Relativity indices (ratio + log ratio)
state_df["rel_emp_idx"] = safe_div(state_df["emp_rate"], state_df["us_emp_rate"])
state_df["rel_hires_idx"] = safe_div(state_df["hires_rate"], state_df["us_hires_rate"])
state_df["rel_earn_idx"] = safe_div(state_df["earn_level"], state_df["us_earn_level"])

state_df["log_rel_emp"] = np.log(state_df["rel_emp_idx"])
state_df["log_rel_hires"] = np.log(state_df["rel_hires_idx"])
state_df["log_rel_earn"] = np.log(state_df["rel_earn_idx"])

# Divergence (growth gap vs US)
state_df["emp_rate_yoy"] = state_df.groupby("State")["emp_rate"].pct_change()
state_df["hires_rate_yoy"] = state_df.groupby("State")["hires_rate"].pct_change()
state_df["earn_level_yoy"] = state_df.groupby("State")["earn_level"].pct_change()

us_tr = us.sort_values("Year").copy()
us_tr["us_emp_rate_yoy"] = us_tr["us_emp_rate"].pct_change()
us_tr["us_hires_rate_yoy"] = us_tr["us_hires_rate"].pct_change()
us_tr["us_earn_level_yoy"] = us_tr["us_earn_level"].pct_change()

state_df = state_df.merge(us_tr[["Year","us_emp_rate_yoy","us_hires_rate_yoy","us_earn_level_yoy"]],
                          on="Year", how="left")

state_df["div_emp_rate_yoy"] = state_df["emp_rate_yoy"] - state_df["us_emp_rate_yoy"]
state_df["div_hires_rate_yoy"] = state_df["hires_rate_yoy"] - state_df["us_hires_rate_yoy"]
state_df["div_earnings_yoy"] = state_df["earn_level_yoy"] - state_df["us_earn_level_yoy"]

# Optional composite (z within year to remove scale)
for c in ["log_rel_emp","log_rel_hires","log_rel_earn","div_emp_rate_yoy","div_hires_rate_yoy","div_earnings_yoy"]:
    mu = state_df.groupby("Year")[c].transform("mean")
    sd = state_df.groupby("Year")[c].transform("std")
    state_df[c + "_z"] = (state_df[c] - mu) / sd

state_df["labor_rel_div_index"] = (
    state_df["log_rel_emp_z"] + state_df["log_rel_hires_z"] + state_df["log_rel_earn_z"]
    + state_df["div_emp_rate_yoy_z"]
)

# Final cleanup
state_df.replace([np.inf, -np.inf], np.nan, inplace=True)

# ---------------------------------------------------------
# STEP 6: Quick diagnostics (crucial)
# ---------------------------------------------------------
print("ECON MODE:", MODE, "| total_coverage =", round(float(total_coverage), 3))
print("Inferred suppressed flag values used (empty = not applied):")
for k,v in suppressed_values.items():
    print(" ", k, ":", v)

print("\nNon-null rates in annual econ:")
print(state_df[["state_emp_avg","state_hires_total","state_avg_earnings"]].notna().mean().to_string())

print("\nShare of zeros among non-missing annual econ (should NOT be ~1.0):")
for c in ["state_emp_avg","state_hires_total"]:
    s = state_df[c]
    print(c, "zero_share=", float((s.fillna(np.nan) == 0).mean()))


  | df_econ["industry_name"].str.lower().str.contains(TOTAL_NAME_PAT, regex=True, na=False)


ECON MODE: total_only | total_coverage = 1.0
Inferred suppressed flag values used (empty = not applied):
  Employment_Count : {'5'}
  New_Hires : {'5'}
  Earnings_Beginning_Qtr : {'5'}

Non-null rates in annual econ:
state_emp_avg         1.0
state_hires_total     1.0
state_avg_earnings    1.0

Share of zeros among non-missing annual econ (should NOT be ~1.0):
state_emp_avg zero_share= 0.0
state_hires_total zero_share= 0.0


  .apply(lambda s: pd.Series({


In [12]:
state_df

Unnamed: 0,State,Year,state_emp_avg,state_emp_sum,state_hires_total,state_wage_bill_total,state_avg_earnings_meanq,state_avg_earnings,state_total_pop,STATE_# Alcohol-Impaired Driving Deaths,...,div_emp_rate_yoy,div_hires_rate_yoy,div_earnings_yoy,log_rel_emp_z,log_rel_hires_z,log_rel_earn_z,div_emp_rate_yoy_z,div_hires_rate_yoy_z,div_earnings_yoy_z,labor_rel_div_index
0,Alabama,2014,1500428.00,6001712.0,1098176.0,1.938636e+10,3229.825614,3230.138191,4843737.0,1273.0,...,,,,-1.181368,-1.220233,-0.906968,,,,
1,Alabama,2015,1528070.25,6112281.0,1162571.0,2.016044e+10,3297.805304,3298.350384,4854803.0,1204.0,...,-0.000769,0.013846,-0.008879,-1.191267,-1.289439,-0.938068,0.252237,0.520389,-0.486096,-3.166536
2,Alabama,2016,1552800.75,6211203.0,1218301.0,2.072056e+10,3335.534152,3335.997555,4866824.0,1274.0,...,0.001445,0.032739,-0.001396,-1.145744,-0.805934,-0.926453,0.380082,0.659294,0.071821,-2.498050
3,Alabama,2017,1570609.00,6282436.0,1258492.0,2.136434e+10,3400.619157,3400.645682,4877989.0,1260.0,...,0.000301,0.015026,-0.013222,-1.151876,-0.937398,-0.959140,0.465198,0.854117,-1.122061,-2.583216
4,Alabama,2018,1593598.00,6374392.0,1345355.0,2.233824e+10,3504.180435,3504.371411,4891628.0,1146.0,...,-0.000372,0.043352,-0.002924,-1.157168,-0.612184,-0.977010,0.240409,2.074639,-0.322340,-2.505953
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
534,Wyoming,2020,197928.25,791713.0,184673.0,3.045369e+09,3844.774056,3846.556459,570864.0,219.0,...,0.010319,-0.036672,-0.070905,-0.344930,1.637040,-0.910118,0.288401,-0.725790,-2.708239,0.670394
535,Wyoming,2021,201439.00,805756.0,201970.0,3.218701e+09,3990.137719,3994.634706,572889.0,209.0,...,-0.003064,-0.095219,-0.038187,-0.352796,1.031721,-1.039499,0.034312,-1.789914,-1.469311,-0.326263
536,Wyoming,2022,207782.25,831129.0,206690.0,3.579169e+09,4303.279713,4306.393788,575106.0,197.0,...,-0.018378,0.006598,0.042850,-0.440596,1.036538,-0.921560,-0.805266,0.521618,1.739180,-1.130883
537,Wyoming,2023,213206.50,852826.0,199070.0,3.855354e+09,4520.934060,4520.680791,578239.0,197.0,...,0.006757,0.061931,0.013430,-0.406893,1.476636,-0.866242,1.078364,1.092691,1.420374,1.281865


In [10]:
# ============================================================
# FULL UPDATED PIPELINE (Lasso + RandomForest + LightGBM)
#  - Target: rel_emp_idx
#  - Test years: 2022-2024 only
#  - Time-aware tuning: walk-forward CV on TRAIN only
#  - Honest evaluation:
#      * Walk-forward OOF on TRAIN (for sanity / robustness)
#      * True holdout TEST (2022-2024)
#  - Robustness:
#      * Mean ensemble
#      * Ridge stacking with walk-forward OOF meta-features (leakage-safe)
#      * Rolling-origin (per-year) diagnostics
#  - X constraints enforced:
#      * NO emp-related predictors
#      * NO yoy/log/z/us/rel variables in X
#      * X includes hires + earnings + health
# ============================================================

import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV, Lasso, RidgeCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score, mean_absolute_error

try:
    from sklearn.metrics import root_mean_squared_error
except Exception:
    from sklearn.metrics import mean_squared_error
    def root_mean_squared_error(y_true, y_pred):
        return float(np.sqrt(mean_squared_error(y_true, y_pred)))

# ---- LightGBM ----
# If you get ImportError: install first:
#   pip install lightgbm
try:
    import lightgbm as lgb
except ImportError as e:
    raise ImportError(
        "LightGBM is not installed. Install it via: pip install lightgbm\n"
        "On some Windows setups you may need Visual C++ Build Tools."
    ) from e


# ------------------------------------------------------------
# Helpers
# ------------------------------------------------------------
def eval_metrics(y_true, y_pred):
    return {
        "r2": float(r2_score(y_true, y_pred)),
        "rmse": float(root_mean_squared_error(y_true, y_pred)),
        "mae": float(mean_absolute_error(y_true, y_pred)),
    }

def make_walk_forward_folds(years: np.ndarray, min_train_years: int = 1):
    years = np.asarray(years).astype(int)
    uniq = np.sort(np.unique(years))
    if len(uniq) < (min_train_years + 1):
        raise ValueError("Not enough unique years for walk-forward folds.")

    folds = []
    for i in range(min_train_years, len(uniq)):
        tr_years = uniq[:i]
        va_year = uniq[i]
        tr_idx = np.where(np.isin(years, tr_years))[0]
        va_idx = np.where(years == va_year)[0]
        if len(tr_idx) and len(va_idx):
            folds.append((tr_idx, va_idx))

    if not folds:
        raise ValueError("No folds created.")
    return folds

def oof_preds_walk_forward(model_fixed, X_train, y_train, folds):
    y_train = np.asarray(y_train)
    oof = np.full(len(y_train), np.nan, dtype=float)
    for tr_idx, va_idx in folds:
        m = clone(model_fixed)
        m.fit(X_train.iloc[tr_idx], y_train[tr_idx])
        oof[va_idx] = m.predict(X_train.iloc[va_idx])
    return oof

def pick_alpha_1se_from_lassocv(lasso_cv: LassoCV):
    """
    1SE rule: pick the MOST regularized alpha whose mean CV MSE is within
    1 std error of the minimum mean MSE.
    """
    mse_path = lasso_cv.mse_path_
    if mse_path.shape[0] == len(lasso_cv.alphas_):
        mean_mse = mse_path.mean(axis=1)
        std_mse  = mse_path.std(axis=1)
    else:
        mean_mse = mse_path.mean(axis=0)
        std_mse  = mse_path.std(axis=0)

    alphas = np.asarray(lasso_cv.alphas_).astype(float)
    i_min = int(np.argmin(mean_mse))
    thresh = mean_mse[i_min] + std_mse[i_min]
    candidates = alphas[mean_mse <= thresh]
    return float(np.max(candidates)) if len(candidates) else float(lasso_cv.alpha_)

def fit_eval_one(model_fixed, X_train, y_train, X_test, y_test, folds_oof, name="model"):
    # Walk-forward OOF on TRAIN
    oof = oof_preds_walk_forward(model_fixed, X_train, y_train, folds_oof)
    valid = np.isfinite(oof)

    # Fit on full TRAIN
    m = clone(model_fixed)
    m.fit(X_train, y_train)

    pred_tr_in = m.predict(X_train)
    pred_te = m.predict(X_test)

    out = {
        "model": name,
        "train_in": eval_metrics(y_train, pred_tr_in),
        "train_oof": eval_metrics(y_train[valid], oof[valid]) if valid.sum() else None,
        "test": eval_metrics(y_test, pred_te),
        "oof_kept": int(valid.sum()),
        "oof_dropped": int((~valid).sum()),
        "fitted": m
    }
    return out

def stack_ridge_oof(base_models_fixed, X_train, y_train, X_test, folds_oof, meta_alphas=None):
    """
    Ridge stacking using walk-forward OOF features (leakage-safe).
    - meta-train features are true OOF predictions by year
    - meta-test uses base models fit on full TRAIN
    """
    if meta_alphas is None:
        meta_alphas = np.logspace(-3, 3, 19)

    y_train = np.asarray(y_train)

    meta_train = np.column_stack([
        oof_preds_walk_forward(m, X_train, y_train, folds_oof)
        for m in base_models_fixed.values()
    ])
    valid = np.all(np.isfinite(meta_train), axis=1)
    if valid.sum() < 60:
        raise RuntimeError(f"Too few OOF rows for stacking meta-learner: kept={int(valid.sum())}")

    meta_test = np.column_stack([
        clone(m).fit(X_train, y_train).predict(X_test)
        for m in base_models_fixed.values()
    ])

    meta = RidgeCV(alphas=meta_alphas, cv=5)
    meta.fit(meta_train[valid], y_train[valid])

    ens_train_oof = np.full(len(y_train), np.nan, dtype=float)
    ens_train_oof[valid] = meta.predict(meta_train[valid])
    ens_test = meta.predict(meta_test)

    weights = pd.Series(meta.coef_, index=list(base_models_fixed.keys()))
    info = {"meta_kept": int(valid.sum()), "meta_dropped": int((~valid).sum())}
    return ens_train_oof, ens_test, valid, weights, info

def rolling_origin_eval(model_fixed, df_model, X_all, y_all, start_year=2017, end_year=2024):
    years = df_model["Year"].astype(int).values
    out = []
    for t in range(start_year, end_year + 1):
        tr_mask = years <= (t - 1)
        te_mask = years == t
        if tr_mask.sum() < 80 or te_mask.sum() < 30:
            continue
        m = clone(model_fixed)
        m.fit(X_all.loc[tr_mask], y_all[tr_mask])
        pred = m.predict(X_all.loc[te_mask])
        met = eval_metrics(y_all[te_mask], pred)
        out.append({"test_year": t, "train_n": int(tr_mask.sum()), "test_n": int(te_mask.sum()), **met})
    return pd.DataFrame(out)


# ============================================================
# 0) BUILD MODELING FRAME FROM state_df
# ============================================================
df_model = state_df.copy()
df_model["Year"] = pd.to_numeric(df_model["Year"], errors="coerce")
df_model = df_model[(df_model["Year"] >= 2014) & (df_model["Year"] <= 2024)].copy()
df_model["State"] = df_model["State"].astype(str)
df_model = df_model.dropna(subset=["Year"]).copy()
df_model["Year"] = df_model["Year"].astype(int)

target = "rel_emp_idx"
df_model[target] = pd.to_numeric(df_model[target], errors="coerce")
df_model = df_model[np.isfinite(df_model[target])].copy()
y_all = df_model[target].values

econ_x = [
    "state_hires_total",
    "econ_hires_per_1k",
    "state_avg_earnings",
    "state_avg_earnings_meanq",
    "state_total_pop",
]
health_x = [
    "STATE_% Children in Poverty",
    "STATE_% Uninsured",
    "STATE_% Adults with Obesity",
    "STATE_% Smokers",
    "STATE_% Fair or Poor Health",
    "STATE_Mentally Unhealthy Days",
    "STATE_Physically Unhealthy Days",
    "STATE_Food Environment Index",
    "STATE_Income Ratio",
    "STATE_Primary Care Physicians Rate",
    "STATE_Preventable Hospitalization Rate",
    "STATE_Violent Crime Rate",
    "STATE_Teen Birth Rate",
    "STATE_% With Access to Exercise Opportunities",
    "STATE_% Some College",
    "STATE_Premature Deaths_per_100k",
    "STATE_# Driving Deaths_per_100k",
    "STATE_# Alcohol-Impaired Driving Deaths_per_100k",
]

X_cols = [c for c in (econ_x + health_x) if c in df_model.columns]
X_all = df_model[X_cols].select_dtypes(include=[np.number]).copy()
X_all = X_all.dropna(axis=1, how="all")
X_all = X_all.loc[:, X_all.nunique(dropna=True) > 1]

# ---- HARD GUARDS: enforce your constraints ----
bad_patterns = [
    "state_emp", "Employment", "econ_emp", "emp_rate", "emp_per", "hire_to_emp",
    "growth_", "log_", "_yoy", "_z", "us_", "rel_"  # allow rel_* only as Y, not X
]
bad_in_X = [c for c in X_all.columns if any(p.lower() in c.lower() for p in bad_patterns)]
if bad_in_X:
    raise ValueError(f"Disallowed columns ended up in X: {bad_in_X}")

print("Target:", target)
print("Rows:", len(df_model), "| Features:", X_all.shape[1])
print("X columns:", X_all.columns.tolist())


# ============================================================
# 1) TIME SPLIT: Test = 2022-2024 only
# ============================================================
test_years = {2022, 2023, 2024}
train_mask = ~df_model["Year"].isin(test_years)
test_mask  =  df_model["Year"].isin(test_years)

X_train, X_test = X_all.loc[train_mask].copy(), X_all.loc[test_mask].copy()
y_train, y_test = y_all[train_mask.values], y_all[test_mask.values]
years_train = df_model.loc[train_mask, "Year"].astype(int).values

print("\nTrain years:", sorted(df_model.loc[train_mask, "Year"].unique()))
print("Test years :", sorted(df_model.loc[test_mask, "Year"].unique()))
print("Train n:", X_train.shape[0], "| Test n:", X_test.shape[0])

# Walk-forward folds on TRAIN only:
# - folds_tune used for hyperparam selection (validate later years in train window)
# - folds_oof used for honest OOF estimation (drops early years with no prior train history)
folds_tune = make_walk_forward_folds(years_train, min_train_years=4)  # validate 2018-2021
folds_oof  = make_walk_forward_folds(years_train, min_train_years=3)  # validate 2017-2021
print("folds_tune:", len(folds_tune), "| folds_oof:", len(folds_oof))


# ============================================================
# 2) MODEL A: LASSO (main, interpretable) + choose alpha by OOF sanity
# ============================================================
lasso_cv = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("lasso_cv", LassoCV(cv=folds_tune, random_state=42, n_alphas=300, max_iter=400000))
])
lasso_cv.fit(X_train, y_train)

alpha_min = float(lasso_cv.named_steps["lasso_cv"].alpha_)
alpha_1se = pick_alpha_1se_from_lassocv(lasso_cv.named_steps["lasso_cv"])

print("\nLasso alpha_min:", alpha_min, "| alpha_1se:", alpha_1se)

lasso_min = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("lasso", Lasso(alpha=alpha_min, max_iter=500000, random_state=42))
])
lasso_1se = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("lasso", Lasso(alpha=alpha_1se, max_iter=500000, random_state=42))
])

res_lasso_min = fit_eval_one(lasso_min, X_train, y_train, X_test, y_test, folds_oof, name="lasso_alpha_min")
res_lasso_1se = fit_eval_one(lasso_1se, X_train, y_train, X_test, y_test, folds_oof, name="lasso_alpha_1se")

# Choose lasso variant by OOF RMSE (more honest), tie-break by test RMSE
cand = sorted([res_lasso_min, res_lasso_1se], key=lambda r: (r["train_oof"]["rmse"], r["test"]["rmse"]))
res_lasso = cand[0]
print("\nChosen Lasso variant:", res_lasso["model"],
      "| OOF RMSE:", res_lasso["train_oof"]["rmse"],
      "| Test RMSE:", res_lasso["test"]["rmse"])


# ============================================================
# 3) MODEL B: RandomForest (nonlinear baseline) + tune
# ============================================================
rf_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("rf", RandomForestRegressor(random_state=42, n_jobs=-1))
])

rf_param_dist = {
    "rf__n_estimators": np.arange(300, 1101, 100),
    "rf__max_depth": [3, 4, 5, 6, 8, 10, None],
    "rf__min_samples_leaf": [2, 3, 5, 8, 10, 15, 20],
    "rf__min_samples_split": [10, 20, 30, 40, 60],
    "rf__max_features": [0.4, 0.5, 0.6, 0.7, "sqrt"],
    "rf__bootstrap": [True],
}

rf_search = RandomizedSearchCV(
    rf_pipe,
    param_distributions=rf_param_dist,
    n_iter=50,
    scoring="neg_root_mean_squared_error",
    cv=folds_tune,
    random_state=42,
    n_jobs=-1,
    verbose=0
)

print("\n--- Tuning RandomForest (train-only, walk-forward CV) ---")
rf_search.fit(X_train, y_train)
rf_best = rf_search.best_estimator_
print("RF best CV RMSE:", float(-rf_search.best_score_))
print("RF best params:", rf_search.best_params_)

res_rf = fit_eval_one(rf_best, X_train, y_train, X_test, y_test, folds_oof, name="rf_tuned")


# ============================================================
# 4) MODEL C: LightGBM (GBDT) + tune (time-aware CV)
# ============================================================
lgb_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),  # optional but keeps consistency
    ("lgb", lgb.LGBMRegressor(
        objective="regression",
        random_state=42,
        n_jobs=-1,
        verbose=-1
    ))
])

# Ranges tuned for small-ish panel data (avoid too-deep trees)
lgb_param_dist = {
    "lgb__n_estimators": np.arange(300, 1601, 100),
    "lgb__learning_rate": np.array([0.01, 0.02, 0.03, 0.05, 0.08, 0.10]),
    "lgb__num_leaves": np.array([15, 31, 47, 63, 95, 127]),
    "lgb__max_depth": np.array([-1, 3, 4, 5, 6, 7]),
    "lgb__min_child_samples": np.array([10, 15, 20, 30, 40, 60]),
    "lgb__subsample": np.array([0.6, 0.7, 0.8, 0.9, 1.0]),
    "lgb__colsample_bytree": np.array([0.6, 0.7, 0.8, 0.9, 1.0]),
    "lgb__reg_alpha": np.array([0.0, 0.01, 0.05, 0.1, 0.2, 0.5, 1.0]),
    "lgb__reg_lambda": np.array([0.0, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0]),
}

lgb_search = RandomizedSearchCV(
    lgb_pipe,
    param_distributions=lgb_param_dist,
    n_iter=60,
    scoring="neg_root_mean_squared_error",
    cv=folds_tune,
    random_state=42,
    n_jobs=-1,
    verbose=0
)

print("\n--- Tuning LightGBM (train-only, walk-forward CV) ---")
lgb_search.fit(X_train, y_train)
lgb_best = lgb_search.best_estimator_
print("LGB best CV RMSE:", float(-lgb_search.best_score_))
print("LGB best params:", lgb_search.best_params_)

res_lgb = fit_eval_one(lgb_best, X_train, y_train, X_test, y_test, folds_oof, name="lgb_tuned")


# ============================================================
# 5) COMPARE MODELS + PICK BEST (by test_rmse, then oof_rmse sanity)
# ============================================================
results = [res_lasso, res_rf, res_lgb]

rows = []
for r in results:
    rows.append({
        "model": r["model"],
        "oof_r2": None if r["train_oof"] is None else r["train_oof"]["r2"],
        "oof_rmse": None if r["train_oof"] is None else r["train_oof"]["rmse"],
        "test_r2": r["test"]["r2"],
        "test_rmse": r["test"]["rmse"],
        "test_mae": r["test"]["mae"],
        "oof_kept": r["oof_kept"],
        "oof_dropped": r["oof_dropped"],
    })

perf = pd.DataFrame(rows).sort_values(["test_rmse", "oof_rmse"])
print("\n=== MODEL COMPARISON (sorted by test_rmse then oof_rmse) ===")
print(perf)

best_name = perf.iloc[0]["model"]
best_fitted = {r["model"]: r["fitted"] for r in results}[best_name]
print("\n>>> BEST SINGLE MODEL:", best_name)


# ============================================================
# 6) ENSEMBLES as ROBUSTNESS CHECK (not the headline model)
#    - mean ensemble
#    - ridge stacking (OOF, leakage-safe)
# ============================================================
# Recreate fixed (cloneable) model objects for stacking:
# IMPORTANT: use the "fixed" pipelines (no internal CV) where applicable.
lasso_for_stack = clone(lasso_min if res_lasso["model"] == "lasso_alpha_min" else lasso_1se)
rf_for_stack    = clone(rf_best)
lgb_for_stack   = clone(lgb_best)

base_fixed = {"lasso": lasso_for_stack, "rf": rf_for_stack, "lgb": lgb_for_stack}

# (A) Mean ensemble
preds_test = []
for m in base_fixed.values():
    mf = clone(m).fit(X_train, y_train)
    preds_test.append(mf.predict(X_test))
pred_mean = np.mean(np.column_stack(preds_test), axis=1)
print("\nMean-ensemble TEST metrics:", eval_metrics(y_test, pred_mean))

# (B) Ridge stacking with walk-forward OOF meta-features
ens_tr_oof, ens_te, valid_meta, weights, info = stack_ridge_oof(base_fixed, X_train, y_train, X_test, folds_oof)
print("\nStacking meta info:", info)
print("Stacking weights:\n", weights)
print("Stacking TRAIN OOF metrics:", eval_metrics(y_train[valid_meta], ens_tr_oof[valid_meta]))
print("Stacking TEST metrics:", eval_metrics(y_test, ens_te))


# ============================================================
# 7) Rolling-origin (per-year) diagnostics (robustness & break detection)
# ============================================================
print("\n=== Rolling-origin per-year diagnostics ===")
start_year = 2017
end_year = 2024

roll_tables = []
for name, m in base_fixed.items():
    tab = rolling_origin_eval(m, df_model, X_all, y_all, start_year=start_year, end_year=end_year)
    tab["model"] = name
    roll_tables.append(tab)

roll_all = pd.concat(roll_tables, ignore_index=True).sort_values(["model", "test_year"])
print(roll_all)


# ============================================================
# 8) Interpretability hook for paper (Lasso coefficients)
# ============================================================
print("\n=== LASSO COEFFICIENTS (chosen variant) ===")
chosen_lasso_fitted = res_lasso["fitted"]
coef = chosen_lasso_fitted.named_steps["lasso"].coef_
coef_s = pd.Series(coef, index=X_train.columns).sort_values(key=lambda s: np.abs(s), ascending=False)

print("Non-zero coefficients:", int((coef_s != 0).sum()), "out of", len(coef_s))
print("\nTop 20 by |coef|:")
print(coef_s.head(20))

# Optional: export
# coef_s.to_csv("lasso_coefficients_rel_emp_idx.csv")


Target: rel_emp_idx
Rows: 539 | Features: 23
X columns: ['state_hires_total', 'econ_hires_per_1k', 'state_avg_earnings', 'state_avg_earnings_meanq', 'state_total_pop', 'STATE_% Children in Poverty', 'STATE_% Uninsured', 'STATE_% Adults with Obesity', 'STATE_% Smokers', 'STATE_% Fair or Poor Health', 'STATE_Mentally Unhealthy Days', 'STATE_Physically Unhealthy Days', 'STATE_Food Environment Index', 'STATE_Income Ratio', 'STATE_Primary Care Physicians Rate', 'STATE_Preventable Hospitalization Rate', 'STATE_Violent Crime Rate', 'STATE_Teen Birth Rate', 'STATE_% With Access to Exercise Opportunities', 'STATE_% Some College', 'STATE_Premature Deaths_per_100k', 'STATE_# Driving Deaths_per_100k', 'STATE_# Alcohol-Impaired Driving Deaths_per_100k']

Train years: [2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021]
Test years : [2022, 2023, 2024]
Train n: 395 | Test n: 144
folds_tune: 4 | folds_oof: 5

Lasso alpha_min: 0.00011905765105954004 | alpha_1se: 0.01964032247770044

Chosen Lasso variant: l

In [13]:
# ============================================================
# CELL 1: Global interpretability for the STACKED ensemble
#  - Feature-space attributions (magnitude + direction)
#  - Leakage-safe background: TRAIN only (2014-2021)
#  - Explain set: TEST only (2022-2024) by default
#
# Requires objects from your modeling cell:
#   df_model, X_all, y_all
#   train_mask, test_mask
#   base_fixed  ({"lasso":..., "rf":..., "lgb":...})  [cloneable pipelines]
#   weights     (pd.Series with index ["lasso","rf","lgb"])
# ============================================================

import numpy as np
import pandas as pd

# --- SHAP import ---
try:
    import shap
except ImportError:
    raise ImportError("Install SHAP first: pip install shap")

# ----------------------------
# Helpers: consistent transforms and SHAP per model
# ----------------------------
def _get_feature_names(X: pd.DataFrame):
    return list(X.columns)

def _impute_only(pipe, X: pd.DataFrame) -> pd.DataFrame:
    """Return imputed X as DataFrame."""
    imp = pipe.named_steps["imputer"]
    X_imp = pd.DataFrame(imp.transform(X), columns=_get_feature_names(X), index=X.index)
    return X_imp

def linear_shap_from_lasso_pipeline(lasso_pipe, X_background: pd.DataFrame, X_explain: pd.DataFrame):
    """
    Exact additive decomposition for Pipeline(imputer, scaler, lasso) in the *original feature names*.
    Works by computing contributions in standardized space:
        phi_j = coef_j * (x_scaled_j - E[x_scaled_j])
    Returns:
        shap_vals: (n, p) array
        base_value: float
        X_explain_imp: DataFrame (imputed, original units)
    """
    imp = lasso_pipe.named_steps["imputer"]
    sc  = lasso_pipe.named_steps["scaler"]
    lm  = lasso_pipe.named_steps["lasso"]

    # impute
    Xb_imp = pd.DataFrame(imp.transform(X_background), columns=_get_feature_names(X_background), index=X_background.index)
    Xe_imp = pd.DataFrame(imp.transform(X_explain),    columns=_get_feature_names(X_explain),    index=X_explain.index)

    # scale
    Xb_sc = sc.transform(Xb_imp)
    Xe_sc = sc.transform(Xe_imp)

    mean_sc = np.nanmean(Xb_sc, axis=0)  # background mean in standardized space
    coefs = np.asarray(lm.coef_, dtype=float)

    shap_vals = (Xe_sc - mean_sc) * coefs  # (n, p)
    base_value = float(lm.intercept_ + np.dot(mean_sc, coefs))
    return shap_vals, base_value, Xe_imp

def tree_shap_from_imputed_pipeline(tree_pipe, model_step_name: str,
                                   X_background: pd.DataFrame, X_explain: pd.DataFrame):
    """
    SHAP for Pipeline(imputer, <tree_model>) using TreeExplainer.
    Returns:
        shap_vals: (n, p) array
        base_value: float
        Xe_imp: DataFrame (imputed)
    """
    imp = tree_pipe.named_steps["imputer"]
    model = tree_pipe.named_steps[model_step_name]

    Xb_imp = pd.DataFrame(imp.transform(X_background), columns=_get_feature_names(X_background), index=X_background.index)
    Xe_imp = pd.DataFrame(imp.transform(X_explain),    columns=_get_feature_names(X_explain),    index=X_explain.index)

    explainer = shap.TreeExplainer(model, data=Xb_imp, feature_perturbation="interventional")
    shap_vals = explainer.shap_values(Xe_imp)
    # shap_vals may be list for multioutput; but you're regression so ensure array
    shap_vals = np.asarray(shap_vals)
    base_value = float(np.asarray(explainer.expected_value).reshape(-1)[0])
    return shap_vals, base_value, Xe_imp

def combine_ensemble_shap(shap_by_model: dict, base_by_model: dict, meta_weights: pd.Series, meta_intercept: float = 0.0):
    """
    Combine base-model SHAP values into stacked ensemble SHAP values:
        f_ens(x) = meta_intercept + sum_m w_m f_m(x)
        phi_ens  = sum_m w_m phi_m
        base_ens = meta_intercept + sum_m w_m base_m
    """
    # ensure aligned order and same feature dimension
    keys = list(meta_weights.index)
    first = keys[0]
    n, p = shap_by_model[first].shape

    shap_ens = np.zeros((n, p), dtype=float)
    base_ens = float(meta_intercept)
    for k in keys:
        w = float(meta_weights[k])
        shap_ens += w * np.asarray(shap_by_model[k], dtype=float)
        base_ens += w * float(base_by_model[k])
    return shap_ens, base_ens

def global_shap_tables(shap_vals: np.ndarray, feature_names: list):
    """
    Returns two pd.Series:
      - mean_abs_shap: importance magnitude
      - mean_shap: signed direction (average contribution)
    """
    mean_abs = pd.Series(np.abs(shap_vals).mean(axis=0), index=feature_names).sort_values(ascending=False)
    mean_signed = pd.Series(shap_vals.mean(axis=0), index=feature_names).loc[mean_abs.index]
    return mean_abs, mean_signed


# ----------------------------
# 1) Define background (TRAIN only) and explain set (TEST only by default)
# ----------------------------
X_train_bg = X_all.loc[train_mask].copy()   # background: 2014-2021
X_explain  = X_all.loc[test_mask].copy()    # explain: 2022-2024 holdout

feature_names = _get_feature_names(X_train_bg)

# ----------------------------
# 2) Fit base models on full TRAIN (no leakage into TEST)
# ----------------------------
base_fitted = {}
for name, pipe in base_fixed.items():
    base_fitted[name] = pipe.fit(X_train_bg, y_all[train_mask.values])

# ----------------------------
# 3) Compute SHAP per base model in original feature space
# ----------------------------
shap_by_model = {}
baseval_by_model = {}

# LASSO (pipeline with scaler)
sh_lasso, bv_lasso, Xe_imp_lasso = linear_shap_from_lasso_pipeline(
    base_fitted["lasso"], X_train_bg, X_explain
)
shap_by_model["lasso"] = sh_lasso
baseval_by_model["lasso"] = bv_lasso

# RF (tree)
sh_rf, bv_rf, Xe_imp_rf = tree_shap_from_imputed_pipeline(
    base_fitted["rf"], model_step_name="rf", X_background=X_train_bg, X_explain=X_explain
)
shap_by_model["rf"] = sh_rf
baseval_by_model["rf"] = bv_rf

# LGB (tree)
sh_lgb, bv_lgb, Xe_imp_lgb = tree_shap_from_imputed_pipeline(
    base_fitted["lgb"], model_step_name="lgb", X_background=X_train_bg, X_explain=X_explain
)
shap_by_model["lgb"] = sh_lgb
baseval_by_model["lgb"] = bv_lgb

# ----------------------------
# 4) Combine into stacked-ensemble SHAP using your stacking weights
#    (Meta intercept is unknown here because you didnâ€™t keep the fitted meta model.
#     Thatâ€™s OK for importance ranking; SHAP magnitudes are unaffected.)
# ----------------------------
sh_ens, base_ens = combine_ensemble_shap(
    shap_by_model=shap_by_model,
    base_by_model=baseval_by_model,
    meta_weights=weights,
    meta_intercept=0.0
)

# ----------------------------
# 5) Global tables: magnitude + direction (on TEST years 2022-2024)
# ----------------------------
ens_mean_abs, ens_mean_signed = global_shap_tables(sh_ens, feature_names)

ens_global_table = pd.DataFrame({
    "mean_abs_shap_ens": ens_mean_abs,
    "mean_shap_signed_ens": ens_mean_signed
})

print("\n=== STACKED ENSEMBLE GLOBAL (TEST 2022-2024) SHAP ===")
print("Stacking weights used:\n", weights.sort_values(ascending=False))
print("\nTop 20 features by ensemble mean(|SHAP|):")
print(ens_global_table.head(20))

# Optional: base-model comparison tables
for m in ["lasso", "rf", "lgb"]:
    m_abs, m_signed = global_shap_tables(shap_by_model[m], feature_names)
    tab = pd.DataFrame({"mean_abs_shap": m_abs, "mean_shap_signed": m_signed})
    print(f"\n--- {m.upper()} global SHAP (TEST 2022-2024), top 10 ---")
    print(tab.head(10))

# ----------------------------
# 6) Optional plots (comment out if running headless)
# ----------------------------
# Xe_imp_for_plot = Xe_imp_lgb  # any imputed X with same columns is fine for plotting
# shap.summary_plot(sh_ens, Xe_imp_for_plot, show=False)            # direction + magnitude
# shap.summary_plot(sh_ens, Xe_imp_for_plot, plot_type="bar", show=False)  # magnitude only





=== STACKED ENSEMBLE GLOBAL (TEST 2022-2024) SHAP ===
Stacking weights used:
 rf       0.507854
lgb      0.405861
lasso    0.202393
dtype: float64

Top 20 features by ensemble mean(|SHAP|):
                                                  mean_abs_shap_ens  \
STATE_% Some College                                       0.033261   
STATE_# Driving Deaths_per_100k                            0.023598   
econ_hires_per_1k                                          0.014847   
STATE_% Fair or Poor Health                                0.010264   
STATE_Violent Crime Rate                                   0.009405   
state_total_pop                                            0.008976   
STATE_Physically Unhealthy Days                            0.008756   
STATE_Food Environment Index                               0.008280   
STATE_# Alcohol-Impaired Driving Deaths_per_100k           0.007913   
STATE_Primary Care Physicians Rate                         0.006824   
STATE_% Children in Poverty 

In [14]:
# ============================================================
# CELL 2: Stability of GLOBAL ensemble ranking (rolling-origin SHAP)
#  - For each year t:
#      train = years <= t-1
#      test  = year == t
#      fit base models on train
#      fit ridge stacker using walk-forward OOF preds in train (leakage-safe)
#      compute ensemble SHAP importance on test year t
#  - Summarize stability across years: top-k freq, mean rank, meanÂ±std importance
#
# Uses your existing helper functions:
#   make_walk_forward_folds, oof_preds_walk_forward
# and uses the same base_fixed pipelines (already tuned) for consistency.
# ============================================================

import numpy as np
import pandas as pd

# Ensure shap imported
try:
    import shap
except ImportError:
    raise ImportError("Install SHAP first: pip install shap")

def fit_meta_ridge_from_oof(base_models_fixed: dict, X_tr: pd.DataFrame, y_tr: np.ndarray,
                           years_tr: np.ndarray, min_train_years_oof: int = 3, meta_alphas=None):
    """
    Fit ridge stacker using leakage-safe walk-forward OOF predictions within TRAIN.
    Returns:
        meta_model (fitted RidgeCV),
        meta_weights (pd.Series),
        meta_intercept (float),
        base_fitted_full_train (dict of fitted base pipelines)
    """
    if meta_alphas is None:
        meta_alphas = np.logspace(-3, 3, 19)

    folds_oof = make_walk_forward_folds(years_tr, min_train_years=min_train_years_oof)

    # OOF meta-features
    meta_train = np.column_stack([
        oof_preds_walk_forward(m, X_tr, y_tr, folds_oof)
        for m in base_models_fixed.values()
    ])
    valid = np.all(np.isfinite(meta_train), axis=1)
    if valid.sum() < 60:
        raise RuntimeError(f"Too few OOF rows for stacking meta-learner: kept={int(valid.sum())}")

    # Fit base models on full train for later prediction + SHAP
    base_full = {k: m.fit(X_tr, y_tr) for k, m in base_models_fixed.items()}

    # Fit meta model (ridge)
    from sklearn.linear_model import RidgeCV
    meta = RidgeCV(alphas=meta_alphas, cv=5)
    meta.fit(meta_train[valid], y_tr[valid])

    w = pd.Series(meta.coef_, index=list(base_models_fixed.keys()))
    b0 = float(meta.intercept_)
    return meta, w, b0, base_full

def shap_for_base_models(base_full: dict, X_bg: pd.DataFrame, X_te: pd.DataFrame):
    """
    Compute SHAP values for each base model on X_te with background X_bg (both are DataFrames).
    Returns:
      shap_by_model: dict name -> (n_test, p) array
      base_by_model: dict name -> base_value float
      X_te_imp_any:  imputed DataFrame with feature names (usable for plots)
    """
    shap_by_model = {}
    base_by_model = {}
    X_te_imp_any = None

    # LASSO
    sh_lasso, bv_lasso, X_te_imp_lasso = linear_shap_from_lasso_pipeline(
        base_full["lasso"], X_bg, X_te
    )
    shap_by_model["lasso"] = sh_lasso
    base_by_model["lasso"] = bv_lasso
    X_te_imp_any = X_te_imp_lasso

    # RF
    sh_rf, bv_rf, X_te_imp_rf = tree_shap_from_imputed_pipeline(
        base_full["rf"], "rf", X_bg, X_te
    )
    shap_by_model["rf"] = sh_rf
    base_by_model["rf"] = bv_rf
    X_te_imp_any = X_te_imp_any if X_te_imp_any is not None else X_te_imp_rf

    # LGB
    sh_lgb, bv_lgb, X_te_imp_lgb = tree_shap_from_imputed_pipeline(
        base_full["lgb"], "lgb", X_bg, X_te
    )
    shap_by_model["lgb"] = sh_lgb
    base_by_model["lgb"] = bv_lgb
    X_te_imp_any = X_te_imp_any if X_te_imp_any is not None else X_te_imp_lgb

    return shap_by_model, base_by_model, X_te_imp_any

def rank_series_desc(s: pd.Series) -> pd.Series:
    """1 = most important. Ties get average rank."""
    return s.rank(ascending=False, method="average")

# ----------------------------
# Rolling-origin SHAP stability
# ----------------------------
years_all = df_model["Year"].astype(int).values
feature_names = list(X_all.columns)

start_year = 2018
end_year = 2024
top_k = 10

per_year_tables = []
per_year_rank = []

for t in range(start_year, end_year + 1):
    tr_mask = years_all <= (t - 1)
    te_mask = years_all == t

    if tr_mask.sum() < 120 or te_mask.sum() < 30:
        continue

    X_tr = X_all.loc[tr_mask].copy()
    y_tr = y_all[tr_mask]
    years_tr = years_all[tr_mask]

    X_te = X_all.loc[te_mask].copy()

    try:
        meta, w_t, b0_t, base_full = fit_meta_ridge_from_oof(
            base_models_fixed=base_fixed,
            X_tr=X_tr,
            y_tr=y_tr,
            years_tr=years_tr,
            min_train_years_oof=3
        )
    except Exception as e:
        print(f"[SKIP year {t}] stacking fit failed: {e}")
        continue

    # SHAP per base model, background = X_tr only (no leakage)
    shap_by_model, base_by_model, X_te_imp_any = shap_for_base_models(base_full, X_tr, X_te)

    # Combine to ensemble SHAP for year t
    sh_ens_t, base_ens_t = combine_ensemble_shap(shap_by_model, base_by_model, w_t, meta_intercept=b0_t)

    # Compute year-t global importance (mean abs) + direction (mean signed) on test year t
    mean_abs_t = pd.Series(np.abs(sh_ens_t).mean(axis=0), index=feature_names).sort_values(ascending=False)
    mean_signed_t = pd.Series(sh_ens_t.mean(axis=0), index=feature_names).loc[mean_abs_t.index]

    tab_t = pd.DataFrame({
        "test_year": t,
        "feature": mean_abs_t.index,
        "mean_abs_shap": mean_abs_t.values,
        "mean_shap_signed": mean_signed_t.values
    })
    per_year_tables.append(tab_t)

    # ranking for stability
    ranks_t = rank_series_desc(mean_abs_t)
    per_year_rank.append(pd.DataFrame({"test_year": t, "feature": ranks_t.index, "rank": ranks_t.values}))

# Combine all year tables
if not per_year_tables:
    raise RuntimeError("No rolling-origin SHAP tables were produced. Check sample sizes / years / folds.")

imp_long = pd.concat(per_year_tables, ignore_index=True)
rank_long = pd.concat(per_year_rank, ignore_index=True)

# ----------------------------
# Stability summary across years
# ----------------------------
years_used = sorted(imp_long["test_year"].unique())
n_years_used = len(years_used)

# Top-k frequency
topk_flags = (
    rank_long.assign(in_topk=lambda d: d["rank"] <= top_k)
             .groupby("feature")["in_topk"].mean()
             .sort_values(ascending=False)
)

# Mean rank + variability
rank_stats = (rank_long.groupby("feature")["rank"]
                      .agg(mean_rank="mean", std_rank="std", n_years="count"))

# Mean(|SHAP|) summary
imp_stats = (imp_long.groupby("feature")["mean_abs_shap"]
                    .agg(mean_abs_shap_mean="mean",
                         mean_abs_shap_std="std",
                         mean_abs_shap_median="median"))

# Signed direction summary
dir_stats = (imp_long.groupby("feature")["mean_shap_signed"]
                    .agg(mean_signed_mean="mean", mean_signed_std="std"))

stability = (imp_stats.join(dir_stats, how="left")
                     .join(rank_stats, how="left")
                     .join(topk_flags.rename(f"top{top_k}_freq"), how="left")
                     .sort_values(["top10_freq" if top_k == 10 else f"top{top_k}_freq",
                                   "mean_abs_shap_mean"], ascending=[False, False]))

print(f"\n=== Rolling-origin ensemble SHAP stability ({start_year}-{end_year}) ===")
print("Years used:", years_used, "| count:", n_years_used)
print(f"\nTop 20 stable features (by top-{top_k} frequency, then mean importance):")
print(stability.head(20))

# Optional: show per-year top-k lists (quick sanity)
print("\nPer-year top features (top 8 by mean|SHAP|):")
for y in years_used:
    top8 = (imp_long[imp_long["test_year"] == y]
            .sort_values("mean_abs_shap", ascending=False)
            .head(8)[["feature", "mean_abs_shap", "mean_shap_signed"]])
    print(f"\nYear {y}:\n", top8.to_string(index=False))

# Optional: export for paper tables
# stability.to_csv("ensemble_shap_stability_summary.csv", index=True)
# imp_long.to_csv("ensemble_shap_importance_by_year.csv", index=False)
# rank_long.to_csv("ensemble_shap_rank_by_year.csv", index=False)


[SKIP year 2018] stacking fit failed: Too few OOF rows for stacking meta-learner: kept=49





=== Rolling-origin ensemble SHAP stability (2018-2024) ===
Years used: [2019, 2020, 2021, 2022, 2023, 2024] | count: 6

Top 20 stable features (by top-10 frequency, then mean importance):
                                                  mean_abs_shap_mean  \
feature                                                                
STATE_% Some College                                        0.029003   
STATE_# Driving Deaths_per_100k                             0.022715   
econ_hires_per_1k                                           0.018165   
STATE_Primary Care Physicians Rate                          0.010688   
STATE_Food Environment Index                                0.009345   
state_total_pop                                             0.007756   
STATE_# Alcohol-Impaired Driving Deaths_per_100k            0.006187   
STATE_% Fair or Poor Health                                 0.009330   
STATE_Violent Crime Rate                                    0.009301   
STATE_Physically Un

In [22]:
import numpy as np
import pandas as pd

# ---------------------------------------------------------
# SETUP: Define your column lists carefully
# ---------------------------------------------------------

# 1) RAW COUNTS -> SUM across counties (after de-dup at county-year)
health_count_cols = [
    '# Alcohol-Impaired Driving Deaths',
    '# Driving Deaths',
    'Premature Deaths'   # confirm this is a count (not a rate)
]

# 2) RATES / INDICES / PERCENTS -> POP-WEIGHTED average across counties
health_rate_cols = [
    '% Adults with Obesity',
    '% Children in Poverty',
    '% Children in Single-Parent Households',
    '% Drive Alone to Work',
    '% Excessive Drinking',
    '% Fair or Poor Health',
    '% Long Commute - Drives Alone',
    '% Low Birthweight',
    '% Severe Housing Problems',
    '% Smokers',
    '% Some College',
    '% Unemployed',
    '% Uninsured',
    '% With Access to Exercise Opportunities',
    'Food Environment Index',
    'Income Ratio',
    'Mentally Unhealthy Days',
    'Physically Unhealthy Days',
    'Preventable Hospitalization Rate',
    'Primary Care Physicians Rate',
    'Social Association Rate',
    'Teen Birth Rate',
    'Violent Crime Rate'
]

# 3) Numeric columns to coerce
numeric_cols = [
    'Year', 'quarter', 'FIPS', 'Population',
    'Employment_Count', 'New_Hires', 'Earnings_Beginning_Qtr'
] + health_count_cols + health_rate_cols

# ---------------------------------------------------------
# STEP 0: PRE-PROCESSING
# ---------------------------------------------------------
df = merged_full.copy()

# Coerce numeric types
for c in numeric_cols:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors='coerce')

# Filter Year Window
df = df[(df['Year'] >= 2014) & (df['Year'] <= 2024)].copy()

# Quarter normalization (in case your data uses 0..3)
if 'quarter' in df.columns:
    qmin, qmax = df['quarter'].min(), df['quarter'].max()
    if pd.notna(qmin) and pd.notna(qmax) and qmin >= 0 and qmax <= 3:
        df['quarter'] = df['quarter'] + 1

# ---------------------------------------------------------
# STEP 1: HEALTH DATA AGGREGATION (County-Year -> State-Year)
#   IMPORTANT: do NOT "sum" health across duplicate industry/quarter rows.
#   First, collapse to unique county-year with FIRST (or median).
# ---------------------------------------------------------

health_keep = ['State', 'Year', 'FIPS', 'Population'] + health_count_cols + health_rate_cols
county_health_raw = df[health_keep].copy()

# Robust de-dup: collapse to one row per county-year
# Use 'first' since values should be identical across duplicated rows.
county_health = (
    county_health_raw
    .sort_values(['State', 'FIPS', 'Year'])
    .groupby(['State', 'Year', 'FIPS'], as_index=False)
    .agg({**{'Population': 'first'},
          **{c: 'first' for c in health_count_cols},
          **{c: 'first' for c in health_rate_cols}})
)

# Create numerators for weighted averages
for col in health_rate_cols:
    county_health[f'{col}_num'] = county_health[col] * county_health['Population']

# Aggregate to state-year
agg_dict = {
    'state_total_pop': ('Population', 'sum'),
}

for col in health_count_cols:
    agg_dict[f'STATE_{col}'] = (col, 'sum')

for col in health_rate_cols:
    agg_dict[f'{col}_num_sum'] = (f'{col}_num', 'sum')

state_health = county_health.groupby(['State', 'Year'], as_index=False).agg(**agg_dict)

# Final pop-weighted rates
for col in health_rate_cols:
    state_health[f'STATE_{col}'] = state_health[f'{col}_num_sum'] / state_health['state_total_pop']
    state_health.drop(columns=[f'{col}_num_sum'], inplace=True)

# ---------------------------------------------------------
# STEP 2: ECON DATA AGGREGATION (County-Quarter-Industry -> State-Year)
#   Key choices for annual:
#   - hires: SUM over quarters (flow)
#   - employment: MEAN over quarters (level/stock)
#   - earnings: employment-weighted across all quarters (person-quarter weighted)
# ---------------------------------------------------------

# Exclude industry '00' to avoid double counting totals
df_econ = df[df['industry'] != '00'].copy()

# Wage bill for correct aggregation of average earnings
df_econ['wage_bill'] = df_econ['Earnings_Beginning_Qtr'] * df_econ['Employment_Count']

# 2A) First build State-Quarter totals (summing across counties + industries)
state_qtr = (
    df_econ.groupby(['State', 'Year', 'quarter'], as_index=False)
    .agg(
        state_emp_qtr=('Employment_Count', 'sum'),
        state_hires_qtr=('New_Hires', 'sum'),
        state_wage_bill_qtr=('wage_bill', 'sum')
    )
)

# Quarter-level avg earnings (optional)
state_qtr['state_avg_earnings_qtr'] = state_qtr['state_wage_bill_qtr'] / state_qtr['state_emp_qtr'].replace({0: np.nan})

# 2B) Now aggregate State-Quarter -> State-Year
state_annual = (
    state_qtr.groupby(['State', 'Year'], as_index=False)
    .agg(
        # employment level: average across quarters
        state_emp_avg=('state_emp_qtr', 'mean'),
        # also keep sum across quarters (person-quarters), useful for weighting
        state_emp_sum=('state_emp_qtr', 'sum'),

        # hires: flow total in the year
        state_hires_total=('state_hires_qtr', 'sum'),

        # wage bill total in the year (sum of person-quarter wage bills)
        state_wage_bill_total=('state_wage_bill_qtr', 'sum'),

        # optional: average quarterly earnings level (simple average of quarter avg)
        state_avg_earnings_meanq=('state_avg_earnings_qtr', 'mean'),
    )
)

# Annual employment-weighted earnings across quarters (recommended)
# Interpretation: average earnings per worker-quarter within the year
state_annual['state_avg_earnings'] = state_annual['state_wage_bill_total'] / state_annual['state_emp_sum'].replace({0: np.nan})

# ---------------------------------------------------------
# STEP 3: MERGE Annual Econ + Annual Health
# ---------------------------------------------------------
state_df = state_annual.merge(state_health, on=['State', 'Year'], how='left')

# ---------------------------------------------------------
# STEP 4: ANNUAL FEATURE ENGINEERING
# ---------------------------------------------------------
pop = state_df['state_total_pop'].replace({0: np.nan})
emp_avg = state_df['state_emp_avg'].replace({0: np.nan})

# Per-capita / rates
state_df['econ_emp_per_1k'] = 1000 * state_df['state_emp_avg'] / pop
state_df['econ_hires_per_1k'] = 1000 * state_df['state_hires_total'] / pop

# Annual hires per worker-year (uses avg employment level)
state_df['econ_hire_rate_annual'] = state_df['state_hires_total'] / emp_avg

# Optional: YoY growth on annual series (lag 1 year)
state_df = state_df.sort_values(['State', 'Year']).copy()

def growth_yoy(x):
    prev = x.shift(1)
    return (x - prev) / prev.replace({0: np.nan})

state_df['growth_emp_yoy'] = state_df.groupby('State')['state_emp_avg'].transform(growth_yoy)
state_df['growth_earn_yoy'] = state_df.groupby('State')['state_avg_earnings'].transform(growth_yoy)
state_df['growth_hires_yoy'] = state_df.groupby('State')['state_hires_total'].transform(growth_yoy)

# Cleanup
state_df.replace([np.inf, -np.inf], np.nan, inplace=True)

# ---------------------------------------------------------
# STEP 5: FINAL OUTPUT
# ---------------------------------------------------------
base_cols = [
    'State', 'Year',
    'state_total_pop',
    # Econ (annual)
    'state_emp_avg', 'state_emp_sum', 'state_hires_total',
    'state_avg_earnings', 'state_avg_earnings_meanq',
    'econ_emp_per_1k', 'econ_hires_per_1k', 'econ_hire_rate_annual',
    'growth_emp_yoy', 'growth_earn_yoy', 'growth_hires_yoy'
]

health_cols_out = [c for c in state_df.columns if c.startswith('STATE_')]

final_cols = base_cols + health_cols_out
state_df_annual_final = state_df[final_cols].copy()

print(f"Annual Aggregated DataFrame Shape: {state_df_annual_final.shape}")
print("Sample Columns:", state_df_annual_final.columns.tolist()[:12])


Annual Aggregated DataFrame Shape: (539, 40)
Sample Columns: ['State', 'Year', 'state_total_pop', 'state_emp_avg', 'state_emp_sum', 'state_hires_total', 'state_avg_earnings', 'state_avg_earnings_meanq', 'econ_emp_per_1k', 'econ_hires_per_1k', 'econ_hire_rate_annual', 'growth_emp_yoy']


In [23]:
state_df_annual_final

Unnamed: 0,State,Year,state_total_pop,state_emp_avg,state_emp_sum,state_hires_total,state_avg_earnings,state_avg_earnings_meanq,econ_emp_per_1k,econ_hires_per_1k,...,STATE_% With Access to Exercise Opportunities,STATE_Food Environment Index,STATE_Income Ratio,STATE_Mentally Unhealthy Days,STATE_Physically Unhealthy Days,STATE_Preventable Hospitalization Rate,STATE_Primary Care Physicians Rate,STATE_Social Association Rate,STATE_Teen Birth Rate,STATE_Violent Crime Rate
0,Alabama,2014,4843737.0,1498318.50,5993274.0,1096661.0,3228.483924,3228.168019,309.331101,226.408040,...,51.935239,6.932836,0.000000,4.254047,4.289236,0.000000,62.132991,0.000000,48.186843,410.745302
1,Alabama,2015,4854803.0,1525839.00,6103356.0,1161013.0,3296.603814,3296.053334,314.294730,239.147294,...,64.355654,6.694086,5.030240,4.252503,4.285823,69.088952,62.883135,12.483551,46.968524,410.093125
2,Alabama,2016,4866824.0,1550592.00,6202368.0,1216125.0,3333.740663,3333.279657,318.604494,249.880620,...,63.141529,6.630406,5.064997,4.613115,4.642889,63.233968,63.920448,12.495976,44.568666,409.633102
3,Alabama,2017,4877989.0,1568279.25,6273117.0,1256587.0,3396.834061,3396.809367,321.501186,257.603492,...,0.000000,6.505342,5.082714,4.346475,4.317696,59.030987,64.602109,12.411106,41.759701,430.881423
4,Alabama,2018,4891628.0,1591449.75,6365799.0,1343611.0,3499.525725,3499.332000,325.341533,274.675629,...,63.172629,6.945420,5.107864,4.469942,4.378239,59.751012,65.327281,12.249050,35.987034,430.315343
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
534,Wyoming,2020,570864.0,197677.50,790710.0,184565.0,3836.381008,3834.619197,346.277747,323.308178,...,76.782077,7.687710,4.166759,3.515226,3.366822,3369.450393,72.041549,11.824352,28.210169,187.296452
535,Wyoming,2021,572889.0,201160.75,804643.0,201699.0,3986.156927,3981.600173,351.133902,352.073438,...,76.801626,7.649759,4.126810,3.959356,3.694561,3378.711441,68.743140,12.110035,25.969930,186.751076
536,Wyoming,2022,575106.0,207544.50,830178.0,206479.0,4295.698368,4292.581068,360.880429,359.027727,...,73.810856,7.765695,4.213440,4.001026,3.506198,3096.947537,72.137204,12.021146,24.283107,186.710976
537,Wyoming,2023,578239.0,212949.25,851797.0,198793.0,4510.154996,4510.339141,368.272029,343.790370,...,77.638870,7.779140,4.235807,4.019662,2.849604,2324.557970,70.486996,11.973542,24.269712,0.000000


In [24]:
state_df_annual_final.to_csv("CH_ECON_V4.csv")

In [25]:

# ---------------------------------------------------------
# SETUP: Define your column lists carefully
# ---------------------------------------------------------

# 1. Variables that are RAW COUNTS -> We will SUM these
# Note: I included 'Premature Deaths' here assuming it is a count. 
# If it is a rate (YPLL Rate), move it to the rate_cols list.
health_count_cols = [
    '# Alcohol-Impaired Driving Deaths', 
    '# Driving Deaths',
    'Premature Deaths' 
]

# 2. Variables that are RATES / INDICES / PERCENTS -> We will WEIGHTED AVERAGE these
health_rate_cols = [
    '% Adults with Obesity', 
    '% Children in Poverty',
    '% Children in Single-Parent Households', 
    '% Drive Alone to Work',
    '% Excessive Drinking', 
    '% Fair or Poor Health',
    '% Long Commute - Drives Alone', 
    '% Low Birthweight',
    '% Severe Housing Problems', 
    '% Smokers', 
    '% Some College',
    '% Unemployed', 
    '% Uninsured',
    '% With Access to Exercise Opportunities', 
    'Food Environment Index',
    'Income Ratio', 
    'Mentally Unhealthy Days', 
    'Physically Unhealthy Days',
    'Preventable Hospitalization Rate',
    'Primary Care Physicians Rate', 
    'Social Association Rate',
    'Teen Birth Rate', 
    'Violent Crime Rate'
]

# 3. Numeric columns to coerce (Standard housekeeping)
numeric_cols = [
    'Year', 'quarter', 'FIPS', 'Population',
    'Employment_Count', 'New_Hires', 'Earnings_Beginning_Qtr'
] + health_count_cols + health_rate_cols

# ---------------------------------------------------------
# STEP 0: PRE-PROCESSING
# ---------------------------------------------------------
df = merged_full.copy()

# Coerce numeric types
for c in numeric_cols:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors='coerce')

# Filter Year Window
df = df[(df['Year'] >= 2014) & (df['Year'] <= 2024)].copy()

# ---------------------------------------------------------
# STEP 1: HEALTH DATA AGGREGATION (Annual)
# ---------------------------------------------------------

# Create a clean County-Year dataset
# We drop duplicates because the original df has many rows (industries) per county
county_health = df[['State', 'Year', 'FIPS', 'Population'] + health_count_cols + health_rate_cols].drop_duplicates()

# --- A. Handle Weighted Averages (Rates) ---
# Create numerators: (Rate * Population)
for col in health_rate_cols:
    county_health[f'{col}_numerator'] = county_health[col] * county_health['Population']

# Define aggregation dictionary
agg_dict = {
    'state_total_pop': ('Population', 'sum'),
}

# Add Sum logic for Count columns
for col in health_count_cols:
    agg_dict[f'STATE_{col}'] = (col, 'sum')

# Add Sum logic for Rate Numerators
for col in health_rate_cols:
    agg_dict[f'{col}_num_sum'] = (f'{col}_numerator', 'sum')

# perform GroupBy
state_health = county_health.groupby(['State', 'Year'], as_index=False).agg(**agg_dict)

# Calculate Final Weighted Averages for Rates
for col in health_rate_cols:
    # Sum of (Rate*Pop) / Total Pop
    state_health[f'STATE_{col}'] = state_health[f'{col}_num_sum'] / state_health['state_total_pop']
    # Drop the temporary numerator column to keep it clean
    state_health.drop(columns=[f'{col}_num_sum'], inplace=True)

# ---------------------------------------------------------
# STEP 2: ECONOMIC DATA AGGREGATION (Quarterly)
# ---------------------------------------------------------

# CRITICAL: Filter out Industry '00' to avoid double counting
# We sum the specific industries to get the State Total
df_econ = df[df['industry'] != '00'].copy()

# Weight Earnings by Employment (because it's an average per person)
df_econ['wage_bill'] = df_econ['Earnings_Beginning_Qtr'] * df_econ['Employment_Count']

state_qtr = (
    df_econ.groupby(['State', 'Year', 'quarter'], as_index=False)
    .agg(
        state_emp_total=('Employment_Count', 'sum'),
        state_hires_total=('New_Hires', 'sum'),
        state_wage_bill_total=('wage_bill', 'sum')
    )
)

# Recover State Average Earnings
state_qtr['state_avg_earnings'] = state_qtr['state_wage_bill_total'] / state_qtr['state_emp_total']

# ---------------------------------------------------------
# STEP 3: MERGE & FEATURE ENGINEERING
# ---------------------------------------------------------

# Merge Annual Health into Quarterly Econ
state_df = state_qtr.merge(state_health, on=['State', 'Year'], how='left')

# Helper variables
pop = state_df['state_total_pop'].replace({0: np.nan})
emp = state_df['state_emp_total'].replace({0: np.nan})

# Per Capita Econ Metrics
state_df['econ_emp_per_1k'] = 1000 * state_df['state_emp_total'] / pop
state_df['econ_hires_per_1k'] = 1000 * state_df['state_hires_total'] / pop
state_df['econ_hire_rate'] = state_df['state_hires_total'] / emp

# Growth Rates (Lagged features)
state_df = state_df.sort_values(['State', 'Year', 'quarter'])

def calc_growth(series, lag):
    prev = series.shift(lag)
    den = prev.replace({0: np.nan})
    return (series - prev) / den

# QoQ Growth
state_df['growth_emp_qoq'] = state_df.groupby('State')['state_emp_total'].transform(lambda x: calc_growth(x, 1))
state_df['growth_earn_qoq'] = state_df.groupby('State')['state_avg_earnings'].transform(lambda x: calc_growth(x, 1))

# YoY Growth (Seasonality adjustment)
state_df['growth_emp_yoy'] = state_df.groupby('State')['state_emp_total'].transform(lambda x: calc_growth(x, 4))

# ---------------------------------------------------------
# STEP 4: CLEANUP & FINAL OUTPUT
# ---------------------------------------------------------
state_df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Organize columns clearly
final_cols = [
    'State', 'Year', 'quarter', 'state_total_pop',
    # Econ
    'state_emp_total', 'state_hires_total', 'state_avg_earnings',
    'econ_emp_per_1k', 'econ_hires_per_1k', 'econ_hire_rate',
    'growth_emp_qoq', 'growth_earn_qoq', 'growth_emp_yoy'
] 
# Add all the State Health Columns (Counts and Rates)
# (They are already named STATE_... in the dataframe)
health_final_cols = [c for c in state_df.columns if c.startswith('STATE_')]
final_cols = final_cols + health_final_cols

# Final Selection
state_df_final = state_df[final_cols]

print(f"Aggregated DataFrame Shape: {state_df_final.shape}")
print("Sample Columns:", state_df_final.columns.tolist()[:10])

Aggregated DataFrame Shape: (2151, 39)
Sample Columns: ['State', 'Year', 'quarter', 'state_total_pop', 'state_emp_total', 'state_hires_total', 'state_avg_earnings', 'econ_emp_per_1k', 'econ_hires_per_1k', 'econ_hire_rate']


In [26]:
state_df_final

Unnamed: 0,State,Year,quarter,state_total_pop,state_emp_total,state_hires_total,state_avg_earnings,econ_emp_per_1k,econ_hires_per_1k,econ_hire_rate,...,STATE_% With Access to Exercise Opportunities,STATE_Food Environment Index,STATE_Income Ratio,STATE_Mentally Unhealthy Days,STATE_Physically Unhealthy Days,STATE_Preventable Hospitalization Rate,STATE_Primary Care Physicians Rate,STATE_Social Association Rate,STATE_Teen Birth Rate,STATE_Violent Crime Rate
0,Alabama,2014,1,4843737.0,1484428.0,234866.0,3183.492990,306.463377,48.488595,0.158220,...,51.935239,6.932836,0.000000,4.254047,4.289236,0.000000,62.132991,0.000000,48.186843,410.745302
1,Alabama,2014,2,4843737.0,1488384.0,296168.0,3186.477239,307.280102,61.144525,0.198986,...,51.935239,6.932836,0.000000,4.254047,4.289236,0.000000,62.132991,0.000000,48.186843,410.745302
2,Alabama,2014,3,4843737.0,1510987.0,292385.0,3158.744593,311.946540,60.363517,0.193506,...,51.935239,6.932836,0.000000,4.254047,4.289236,0.000000,62.132991,0.000000,48.186843,410.745302
3,Alabama,2014,4,4843737.0,1509475.0,273242.0,3383.957254,311.634385,56.411403,0.181018,...,51.935239,6.932836,0.000000,4.254047,4.289236,0.000000,62.132991,0.000000,48.186843,410.745302
4,Alabama,2015,1,4854803.0,1510048.0,246062.0,3224.297608,311.042075,50.684240,0.162950,...,64.355654,6.694086,5.030240,4.252503,4.285823,69.088952,62.883135,12.483551,46.968524,410.093125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2146,Wyoming,2023,4,578239.0,215710.0,45344.0,4717.370878,373.046439,78.417402,0.210208,...,77.638870,7.779140,4.235807,4.019662,2.849604,2324.557970,70.486996,11.973542,24.269712,0.000000
2147,Wyoming,2024,1,580752.0,208388.0,36781.0,4665.328800,358.824421,63.333402,0.176502,...,77.761949,7.781328,4.259097,4.750646,3.437344,2178.144266,70.319097,12.253024,20.177949,0.000000
2148,Wyoming,2024,2,580752.0,208453.0,59642.0,4530.285609,358.936345,102.697881,0.286117,...,77.761949,7.781328,4.259097,4.750646,3.437344,2178.144266,70.319097,12.253024,20.177949,0.000000
2149,Wyoming,2024,3,580752.0,222565.0,51118.0,4574.083715,383.235873,88.020360,0.229677,...,77.761949,7.781328,4.259097,4.750646,3.437344,2178.144266,70.319097,12.253024,20.177949,0.000000


In [27]:
output_path = r"C:\Users\liuc\Downloads\CH_ECON_V3.csv"  # change if needed
#state_df_final.to_csv(output_path, index=False)

In [28]:
# ============================================
# US State Choropleth (Using state_df_final)
# ============================================

import pandas as pd
import plotly.express as px
from ipywidgets import widgets, interactive_output
from IPython.display import display

# 1. Prepare the Data
# --------------------------------------------
# We group by State/Year and take the MEAN.
# - For Health data (Annual): The value is constant across quarters, so Mean returns the value.
# - For Econ data (Quarterly): This gives us the "Average Quarterly" level for that year.
df_map = state_df_final.groupby(['State', 'Year'], as_index=False).mean(numeric_only=True)

# 2. Map State Names to USPS Codes
# --------------------------------------------
state_to_code = {
    'Alabama':'AL','Alaska':'AK','Arizona':'AZ','Arkansas':'AR','California':'CA','Colorado':'CO',
    'Connecticut':'CT','Delaware':'DE','District of Columbia':'DC','Florida':'FL','Georgia':'GA',
    'Hawaii':'HI','Idaho':'ID','Illinois':'IL','Indiana':'IN','Iowa':'IA','Kansas':'KS','Kentucky':'KY',
    'Louisiana':'LA','Maine':'ME','Maryland':'MD','Massachusetts':'MA','Michigan':'MI','Minnesota':'MN',
    'Mississippi':'MS','Missouri':'MO','Montana':'MT','Nebraska':'NE','Nevada':'NV','New Hampshire':'NH',
    'New Jersey':'NJ','New Mexico':'NM','New York':'NY','North Carolina':'NC','North Dakota':'ND',
    'Ohio':'OH','Oklahoma':'OK','Oregon':'OR','Pennsylvania':'PA','Rhode Island':'RI','South Carolina':'SC',
    'South Dakota':'SD','Tennessee':'TN','Texas':'TX','Utah':'UT','Vermont':'VT','Virginia':'VA',
    'Washington':'WA','West Virginia':'WV','Wisconsin':'WI','Wyoming':'WY'
}

df_map['code'] = df_map['State'].map(state_to_code)

# 3. Define Columns for Dropdown
# --------------------------------------------
# We dynamically pull the columns available in your dataframe
# to ensure the dropdown never breaks.
exclude_cols = ['State', 'Year', 'quarter', 'code', 'state_total_pop']
available_cols = [c for c in df_map.columns if c not in exclude_cols]
available_cols.sort()

# Set a smart default
default_val = 'econ_emp_per_1k' if 'econ_emp_per_1k' in available_cols else available_cols[0]

# 4. Widgets
# --------------------------------------------
var_dd = widgets.Dropdown(
    options=available_cols,
    value=default_val,
    description='Variable:',
    layout=widgets.Layout(width='450px')
)

yrs = sorted(df_map['Year'].unique())
year_dd = widgets.Dropdown(
    options=yrs,
    value=yrs[-1], # Default to latest year
    description='Year:',
    layout=widgets.Layout(width='200px')
)

# 5. Custom Colorscale (Your Preferred Yellow-Gold)
# --------------------------------------------
ylw_scale = [
    (0.00, "#fffde7"),
    (0.33, "#fff59d"),
    (0.66, "#fdd835"),
    (1.00, "#f9a825")
]

# 6. Plot Function
# --------------------------------------------
def show_map(variable, year):
    # Filter data for specific year
    d = df_map[df_map['Year'] == year].copy()

    # Dynamic Range calculation for better contrast
    # (Avoids 0s or NaNs skewing the scale)
    valid_values = d[variable].dropna()
    if len(valid_values) > 0:
        low = valid_values.quantile(0.05)
        high = valid_values.quantile(0.95)
        rc = [low, high]
    else:
        rc = None

    fig = px.choropleth(
        d,
        locations="code",
        locationmode="USA-states",
        color=variable,
        scope="usa",
        range_color=rc,
        color_continuous_scale=ylw_scale,
        hover_name="State",
        hover_data={'code':False, 'Year':True, variable:':.2f'},
        labels={variable: variable.replace('_',' ').replace('STATE', '').title()}
    )

    fig.update_layout(
        title=dict(
            text=f"US States â€” {variable.replace('_',' ').title()} ({year})",
            x=0.5,
            xanchor='center'
        ),
        coloraxis_colorbar=dict(title="Value"),
        geo=dict(bgcolor='rgba(0,0,0,0)'), # Transparent geo background
        width=1050,
        height=600,
        margin=dict(l=0,r=0,t=60,b=0)
    )

    fig.update_traces(marker_line_color="white", marker_line_width=0.5)
    fig.show()

# 7. Display UI
# --------------------------------------------
ui = widgets.HBox([var_dd, year_dd])
out = interactive_output(show_map, {'variable': var_dd, 'year': year_dd})

display(ui, out)

HBox(children=(Dropdown(description='Variable:', index=26, layout=Layout(width='450px'), options=('STATE_# Alcâ€¦

Output()

In [39]:
# ========================================================
# Interactive State Trend Plot (2014â€“2024)
# Using pre-aggregated 'state_df_final'
# ========================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from ipywidgets import widgets, interactive_output
from IPython.display import display

# --------------------------------------
# 1) Prepare Annual Data
# --------------------------------------
# We start with state_df_final (Quarterly) and collapse to Annual.
# Logic:
# - Health columns are constant for the year, so mean() returns the correct value.
# - Econ columns are quarterly, so mean() gives the "Average Quarterly Level" for that year.
df_trend = state_df_final.groupby(['State', 'Year'], as_index=False).mean(numeric_only=True)

# Ensure full 2014-2024 range for every state (handling missing years)
def _complete_years(g):
    # Create reference index
    all_years = pd.DataFrame({'Year': np.arange(2014, 2025)})
    # Merge existing data onto it
    g = all_years.merge(g, on='Year', how='left')
    # Fill State name downwards and upwards
    g['State'] = g['State'].ffill().bfill()
    return g

df_trend = (
    df_trend.groupby('State', as_index=False, group_keys=False)
    .apply(_complete_years)
    .sort_values(['State', 'Year'])
)

# --------------------------------------
# 2) Define Variables for Dropdowns
# --------------------------------------
exclude_cols = ['State', 'Year', 'quarter', 'state_total_pop']
# Get all numeric columns except the excluded ones
all_vars = sorted([c for c in df_trend.columns if c not in exclude_cols])

state_options = sorted(df_trend['State'].dropna().unique().tolist())

# --------------------------------------
# 3) Widgets
# --------------------------------------
state_dd = widgets.Dropdown(
    options=state_options,
    value=state_options[0] if state_options else None,
    description='State:',
    layout=widgets.Layout(width='250px')
)

var1_dd = widgets.Dropdown(
    options=all_vars,
    value='econ_emp_per_1k' if 'econ_emp_per_1k' in all_vars else all_vars[0],
    description='Variable 1:',
    layout=widgets.Layout(width='400px')
)

# Try to find a good default for Var 2 (e.g., Obesity)
default_v2 = [v for v in all_vars if 'Obesity' in v]
default_v2 = default_v2[0] if default_v2 else (all_vars[1] if len(all_vars) > 1 else all_vars[0])

var2_dd = widgets.Dropdown(
    options=all_vars,
    value=default_v2,
    description='Variable 2:',
    layout=widgets.Layout(width='400px')
)

normalize_cb = widgets.Checkbox(
    value=False,
    description='Normalize (0â€“1)',
    indent=False
)

# --------------------------------------
# 4) Plotting Function
# --------------------------------------
def plot_state_trends(state, var1, var2, normalize):
    if not state or not var1 or not var2:
        return

    # Filter data
    sub = df_trend[df_trend['State'] == state].copy()
    
    x = sub['Year']
    y1 = sub[var1]
    y2 = sub[var2]

    # --- Normalization Logic ---
    def _minmax(s):
        mn, mx = s.min(), s.max()
        if pd.isna(mn) or pd.isna(mx) or mx == mn:
            return s
        return (s - mn) / (mx - mn)

    if normalize:
        y1_plot = _minmax(y1)
        y2_plot = _minmax(y2)
        y1_lbl = f"{var1} (Scaled)"
        y2_lbl = f"{var2} (Scaled)"
    else:
        y1_plot, y2_plot = y1, y2
        y1_lbl = var1
        y2_lbl = var2

    # --- Plotting ---
    fig, ax1 = plt.subplots(figsize=(10, 5))
    
    # Style 1
    color1 = "#1f77b4" # Tab:Blue
    line1 = ax1.plot(x, y1_plot, marker='o', linestyle='-', linewidth=2, color=color1, label=y1_lbl)
    ax1.set_xlabel("Year", fontsize=10)
    ax1.set_ylabel(y1_lbl, color=color1, fontsize=10, fontweight='bold')
    ax1.tick_params(axis='y', labelcolor=color1)
    ax1.grid(True, linestyle='--', alpha=0.5)

    # Style 2 (Twin Axis)
    ax2 = ax1.twinx()
    color2 = "#ff7f0e" # Tab:Orange
    line2 = ax2.plot(x, y2_plot, marker='s', linestyle='--', linewidth=2, color=color2, label=y2_lbl)
    ax2.set_ylabel(y2_lbl, color=color2, fontsize=10, fontweight='bold')
    ax2.tick_params(axis='y', labelcolor=color2)

    # Title
    norm_txt = " (Normalized Trend)" if normalize else ""
    plt.title(f"{state}: {var1} vs. {var2}{norm_txt}", fontsize=12)
    plt.xticks(np.arange(2014, 2025, 1))

    # Unified Legend
    lines = line1 + line2
    labels = [l.get_label() for l in lines]
    ax1.legend(lines, labels, loc='upper left', frameon=True)

    plt.tight_layout()
    plt.show()

# --------------------------------------
# 5) Display
# --------------------------------------
ui = widgets.VBox([
    widgets.HBox([state_dd, normalize_cb]),
    widgets.HBox([var1_dd, var2_dd])
])

out = interactive_output(
    plot_state_trends,
    {'state': state_dd, 'var1': var1_dd, 'var2': var2_dd, 'normalize': normalize_cb}
)

display(ui, out)





VBox(children=(HBox(children=(Dropdown(description='State:', layout=Layout(width='250px'), options=('Alabama',â€¦

Output()