In [1]:
# ===============================
# 1. Setup and Load BLS Data
# ===============================

import pandas as pd
import os

# Load the BLS CSV file
df = pd.read_csv("bls_mi/Michigan_county_employment.csv")

print("Shape of dataset:", df.shape)
print("Columns:", df.columns.tolist())
print("\nInfo:")
print(df.info())
display(df.head())

# Standardize column names
df = df.rename(columns={"county": "county_name", "year": "survey_year"})

# Collapse monthly values to yearly summaries
bls_summary = (
    df.groupby(["county_name", "survey_year"])
    .agg({
        "employment": ["mean", "min", "max"],
        "unemployment": ["mean", "min", "max"],
        "labor_force": ["mean", "min", "max"],
        "unemployment_rate": ["mean", "min", "max"]
    })
    .reset_index()
)

# Flatten hierarchical columns
bls_summary.columns = ["_".join(c).strip("_") for c in bls_summary.columns.values]

print("BLS summary shape:", bls_summary.shape)
display(bls_summary.head())

Shape of dataset: (37848, 7)
Columns: ['county', 'year', 'month', 'unemployment_rate', 'labor_force', 'employment', 'unemployment']

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37848 entries, 0 to 37847
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   county             37848 non-null  object 
 1   year               37848 non-null  int64  
 2   month              37848 non-null  int64  
 3   unemployment_rate  37848 non-null  float64
 4   labor_force        37848 non-null  float64
 5   employment         37848 non-null  float64
 6   unemployment       37848 non-null  float64
dtypes: float64(4), int64(2), object(1)
memory usage: 2.0+ MB
None


Unnamed: 0,county,year,month,unemployment_rate,labor_force,employment,unemployment
0,"Alcona County, MI",1990,1,16.6,636.0,3192.0,3828.0
1,"Alcona County, MI",1990,2,16.2,611.0,3171.0,3782.0
2,"Alcona County, MI",1990,3,15.8,608.0,3246.0,3854.0
3,"Alcona County, MI",1990,4,14.7,579.0,3369.0,3948.0
4,"Alcona County, MI",1990,5,11.7,468.0,3524.0,3992.0


BLS summary shape: (2988, 14)


Unnamed: 0,county_name,survey_year,employment_mean,employment_min,employment_max,unemployment_mean,unemployment_min,unemployment_max,labor_force_mean,labor_force_min,labor_force_max,unemployment_rate_mean,unemployment_rate_min,unemployment_rate_max
0,"Alcona County, MI",1990,3420.538462,3171.0,3662.0,3933.923077,3782.0,4190.0,513.384615,382.0,636.0,13.084615,9.8,16.6
1,"Alcona County, MI",1991,3368.846154,3112.0,3643.0,3937.923077,3770.0,4183.0,569.076923,342.0,739.0,14.438462,9.0,19.1
2,"Alcona County, MI",1992,3473.307692,3242.0,3712.0,4138.692308,3985.0,4428.0,665.384615,493.0,805.0,16.092308,11.8,19.6
3,"Alcona County, MI",1993,3870.153846,3683.0,4118.0,4493.307692,4347.0,4721.0,623.153846,525.0,737.0,13.892308,11.4,16.7
4,"Alcona County, MI",1994,4276.846154,3845.0,4591.0,4896.538462,4659.0,5205.0,619.692308,446.0,814.0,12.723077,9.2,17.5


In [24]:
# ===============================
# 2. Load and Clean CDC Health Core Data
# ===============================

data_dir = "cdc_mi"
files = [f for f in os.listdir(data_dir) if f.endswith(".csv")]

dfs = []
for file in files:
    file_path = os.path.join(data_dir, file)
    df = pd.read_csv(file_path)
    
    # Keep only California rows
    if "state_name" in df.columns:
        df = df[df["state_name"].str.contains("Michigan", case=False, na=False)]
    elif "_state" in df.columns:
        df = df[df["_state"] == 6]  # Michigan Fips
    
    # Restrict years
    if "survey_year" in df.columns:
        df = df[(df["survey_year"] >= 1993) & (df["survey_year"] <= 2010)]
    
    dfs.append(df)

# Combine all years
health_core_data = pd.concat(dfs, ignore_index=True)

# Standardize county_name → "County, ST" format
health_core_data["county_name"] = (
    health_core_data["county_name"].str.strip() + ", " +
    health_core_data["state_name"].str.strip().str[:2].str.upper()
)


print("Health core shape:", health_core_data.shape)
print("Columns:", health_core_data.columns.tolist()[:12], "...")
print("Earliest year:", health_core_data["survey_year"].min())
print("Latest year:", health_core_data["survey_year"].max())
display(health_core_data.head())

Health core shape: (34708, 25)
Columns: ['county_name', 'state_name', 'survey_year', 'respondent_age', 'respondent_sex', 'marital_status', 'education_level', 'employment_status', 'household_income_category', 'num_adults_in_household', 'num_men_in_household', 'num_women_in_household'] ...
Earliest year: 2010
Latest year: 2010


Unnamed: 0,county_name,state_name,survey_year,respondent_age,respondent_sex,marital_status,education_level,employment_status,household_income_category,num_adults_in_household,...,days_physical_health_not_good,days_mental_health_not_good,days_poor_health_limited_activities,smoked_100_cigarettes,county_code,age_group_5yr,age65plus_flag,currently_has_asthma,ever_had_mammogram,teeth_cleaned_by_dentist
0,"Allegan County, MI",Michigan,2010,57.0,1.0,1.0,6.0,7.0,5.0,2.0,...,,,,,,,,,,
1,"Allegan County, MI",Michigan,2010,48.0,1.0,1.0,4.0,2.0,6.0,2.0,...,,,,,,,,,,
2,"Allegan County, MI",Michigan,2010,83.0,2.0,5.0,6.0,7.0,6.0,2.0,...,,,,,,,,,,
3,"Allegan County, MI",Michigan,2010,49.0,1.0,2.0,4.0,1.0,7.0,1.0,...,,,,,,,,,,
4,"Allegan County, MI",Michigan,2010,60.0,2.0,1.0,6.0,1.0,8.0,2.0,...,,,,,,,,,,


In [32]:
data_columns = health_core_data.columns.tolist
print(data_columns)

<bound method IndexOpsMixin.tolist of Index(['county_name', 'state_name', 'survey_year', 'respondent_age',
       'respondent_sex', 'marital_status', 'education_level',
       'employment_status', 'household_income_category',
       'num_adults_in_household', 'num_men_in_household',
       'num_women_in_household', 'has_any_health_plan',
       'could_not_see_doctor_due_to_cost', 'general_health_status',
       'days_physical_health_not_good', 'days_mental_health_not_good',
       'days_poor_health_limited_activities', 'smoked_100_cigarettes',
       'county_code', 'age_group_5yr', 'age65plus_flag',
       'currently_has_asthma', 'ever_had_mammogram',
       'teeth_cleaned_by_dentist'],
      dtype='object')>


In [34]:
# ===============================
# 3. Categorical Percent Breakdowns
# ===============================

cat_vars_mi = [
    # demographics
    "respondent_sex",
    "marital_status",
    "education_level",
    "employment_status",
    "household_income_category",
    "age_group_5yr",
    "age65plus_flag",

    # health coverage / access
    "has_any_health_plan",
    "could_not_see_doctor_due_to_cost",

    # health status / behaviors
    "general_health_status",
    "smoked_100_cigarettes",
    "currently_has_asthma",
    "ever_had_mammogram",
    "teeth_cleaned_by_dentist"
]

percent_dfs = []

for col in cat_vars_mi:
    temp = (
        health_core_data
        .groupby(["county_name", "survey_year", col])
        .size()
        .reset_index(name="count")
    )
    
    temp["percent"] = (
        temp.groupby(["county_name", "survey_year"])["count"]
        .transform(lambda x: 100 * x / x.sum())
    )
    
    temp["variable"] = col
    temp.rename(columns={col: "category"}, inplace=True)
    temp = temp[["county_name", "survey_year", "variable", "category", "percent"]]
    
    percent_dfs.append(temp)

percent_breakdowns = pd.concat(percent_dfs, ignore_index=True)

print("Percent breakdowns shape:", percent_breakdowns.shape)
display(percent_breakdowns.head(20))

Percent breakdowns shape: (4429, 5)


Unnamed: 0,county_name,survey_year,variable,category,percent
0,"Allegan County, MI",2010,respondent_sex,1.0,43.809524
1,"Allegan County, MI",2010,respondent_sex,2.0,56.190476
2,"Alpena County, MI",2010,respondent_sex,1.0,32.432432
3,"Alpena County, MI",2010,respondent_sex,2.0,67.567568
4,"Antrim County, MI",2010,respondent_sex,1.0,42.857143
5,"Antrim County, MI",2010,respondent_sex,2.0,57.142857
6,"Arenac County, MI",2010,respondent_sex,1.0,50.0
7,"Arenac County, MI",2010,respondent_sex,2.0,50.0
8,"Barry County, MI",2010,respondent_sex,1.0,58.974359
9,"Barry County, MI",2010,respondent_sex,2.0,41.025641


In [35]:
# ===============================
# 4. Pivot CDC Data to Wide Format
# ===============================

cdc_wide = (
    percent_breakdowns
    .pivot_table(
        index=["county_name", "survey_year"],
        columns=["variable", "category"],
        values="percent",
        fill_value=0
    )
)

cdc_wide.columns = [
    f"{var}_{int(cat) if isinstance(cat, (int, float)) and float(cat).is_integer() else cat}"
    for var, cat in cdc_wide.columns
]

cdc_wide = cdc_wide.reset_index()

print("CDC wide shape:", cdc_wide.shape)
display(cdc_wide.head())

CDC wide shape: (71, 87)


Unnamed: 0,county_name,survey_year,age65plus_flag_1,age65plus_flag_2,age65plus_flag_3,age_group_5yr_1,age_group_5yr_2,age_group_5yr_3,age_group_5yr_4,age_group_5yr_5,...,smoked_100_cigarettes_2,smoked_100_cigarettes_7,smoked_100_cigarettes_9,teeth_cleaned_by_dentist_1,teeth_cleaned_by_dentist_2,teeth_cleaned_by_dentist_3,teeth_cleaned_by_dentist_4,teeth_cleaned_by_dentist_7,teeth_cleaned_by_dentist_8,teeth_cleaned_by_dentist_9
0,"Allegan County, MI",2010,75.238095,23.809524,0.952381,3.809524,2.857143,4.761905,4.761905,9.52381,...,50.47619,0.0,0.0,73.11828,6.451613,7.526882,9.677419,1.075269,2.150538,0.0
1,"Alpena County, MI",2010,48.648649,48.648649,2.702703,0.0,5.405405,0.0,2.702703,2.702703,...,48.648649,0.0,0.0,75.0,12.5,3.125,3.125,0.0,6.25,0.0
2,"Antrim County, MI",2010,50.0,50.0,0.0,3.571429,7.142857,0.0,3.571429,3.571429,...,28.571429,3.571429,0.0,77.777778,7.407407,3.703704,11.111111,0.0,0.0,0.0
3,"Arenac County, MI",2010,55.555556,44.444444,0.0,0.0,0.0,0.0,5.555556,11.111111,...,38.888889,0.0,0.0,46.666667,26.666667,6.666667,20.0,0.0,0.0,0.0
4,"Barry County, MI",2010,70.512821,29.487179,0.0,6.410256,2.564103,3.846154,3.846154,6.410256,...,43.589744,1.282051,0.0,65.789474,14.473684,11.842105,7.894737,0.0,0.0,0.0


In [37]:
# ===============================
# 5. Apply Readable Labels (Codebook)
# ===============================

codebook_map = {
    "general_health_status": {
        1: "Excellent", 2: "Very good", 3: "Good", 4: "Fair", 5: "Poor",
        7: "Don’t know/Not sure", 9: "Refused"
    },
    "ever_told_high_bp": {
        1: "Yes", 2: "No", 7: "Don’t know/Not sure", 9: "Refused"
    },
    "ever_told_diabetes": {
        1: "Yes", 2: "No", 3: "Yes, during pregnancy", 4: "Pre-diabetes/Borderline",
        7: "Don’t know/Not sure", 9: "Refused"
    },
    "smoked_100_cigarettes": {
        1: "Yes", 2: "No", 7: "Don’t know/Not sure", 9: "Refused"
    },
    "current_smoking_frequency": {
        1: "Every day", 2: "Some days", 3: "Not at all",
        7: "Don’t know/Not sure", 9: "Refused"
    },
    "any_alcohol_past_month": {
        1: "Yes", 2: "No", 7: "Don’t know/Not sure", 9: "Refused"
    },
    "eats_fruit": {
        1: "Yes", 2: "No", 7: "Don’t know/Not sure", 9: "Refused"
    },
    "eats_other_vegetables": {
        1: "Yes", 2: "No", 7: "Don’t know/Not sure", 9: "Refused"
    },
    "any_physical_activity": {
        1: "Yes", 2: "No", 7: "Don’t know/Not sure", 9: "Refused"
    }
}

def decode_food_code(val):
    if val in [1, 2, 7, 9]:
        return {1: "Yes", 2: "No", 7: "Don’t know/Not sure", 9: "Refused"}[val]
    elif 101 <= val <= 199:
        return f"{val - 100} times/day"
    elif 201 <= val <= 299:
        return f"{val - 200} times/week"
    elif 301 <= val <= 399:
        return f"{val - 300} times/month"
    else:
        return f"Code {val}"

rename_map = {}
for col in cdc_wide.columns:
    if col in ["county_name", "survey_year"]:
        continue
    var, cat = col.rsplit("_", 1)
    try:
        cat_int = int(cat)
    except:
        cat_int = cat
    
    if var in codebook_map and cat_int in codebook_map[var]:
        rename_map[col] = f"{var} - {codebook_map[var][cat_int]}"
    elif var in ["eats_fruit", "eats_other_vegetables"]:
        rename_map[col] = f"{var} - {decode_food_code(cat_int)}"
    else:
        rename_map[col] = col

cdc_wide = cdc_wide.rename(columns=rename_map)

print("CDC wide with labels shape:", cdc_wide.shape)
display(cdc_wide.head())

CDC wide with labels shape: (71, 87)


Unnamed: 0,county_name,survey_year,age65plus_flag_1,age65plus_flag_2,age65plus_flag_3,age_group_5yr_1,age_group_5yr_2,age_group_5yr_3,age_group_5yr_4,age_group_5yr_5,...,smoked_100_cigarettes - No,smoked_100_cigarettes - Don’t know/Not sure,smoked_100_cigarettes - Refused,teeth_cleaned_by_dentist_1,teeth_cleaned_by_dentist_2,teeth_cleaned_by_dentist_3,teeth_cleaned_by_dentist_4,teeth_cleaned_by_dentist_7,teeth_cleaned_by_dentist_8,teeth_cleaned_by_dentist_9
0,"Allegan County, MI",2010,75.238095,23.809524,0.952381,3.809524,2.857143,4.761905,4.761905,9.52381,...,50.47619,0.0,0.0,73.11828,6.451613,7.526882,9.677419,1.075269,2.150538,0.0
1,"Alpena County, MI",2010,48.648649,48.648649,2.702703,0.0,5.405405,0.0,2.702703,2.702703,...,48.648649,0.0,0.0,75.0,12.5,3.125,3.125,0.0,6.25,0.0
2,"Antrim County, MI",2010,50.0,50.0,0.0,3.571429,7.142857,0.0,3.571429,3.571429,...,28.571429,3.571429,0.0,77.777778,7.407407,3.703704,11.111111,0.0,0.0,0.0
3,"Arenac County, MI",2010,55.555556,44.444444,0.0,0.0,0.0,0.0,5.555556,11.111111,...,38.888889,0.0,0.0,46.666667,26.666667,6.666667,20.0,0.0,0.0,0.0
4,"Barry County, MI",2010,70.512821,29.487179,0.0,6.410256,2.564103,3.846154,3.846154,6.410256,...,43.589744,1.282051,0.0,65.789474,14.473684,11.842105,7.894737,0.0,0.0,0.0


In [38]:
# ===============================
# 6. Merge CDC Health and BLS Labor Data
# ===============================

merged = pd.merge(
    cdc_wide,
    bls_summary,
    on=["county_name", "survey_year"],
    how="inner"   # inner join: only keep counties/years that appear in both
)

print("Final merged shape:", merged.shape)
display(merged.head())

Final merged shape: (71, 99)


Unnamed: 0,county_name,survey_year,age65plus_flag_1,age65plus_flag_2,age65plus_flag_3,age_group_5yr_1,age_group_5yr_2,age_group_5yr_3,age_group_5yr_4,age_group_5yr_5,...,employment_max,unemployment_mean,unemployment_min,unemployment_max,labor_force_mean,labor_force_min,labor_force_max,unemployment_rate_mean,unemployment_rate_min,unemployment_rate_max
0,"Allegan County, MI",2010,75.238095,23.809524,0.952381,3.809524,2.857143,4.761905,4.761905,9.52381,...,51704.0,56713.769231,55285.0,58340.0,6510.230769,5288.0,7922.0,11.469231,9.5,14.1
1,"Alpena County, MI",2010,48.648649,48.648649,2.702703,0.0,5.405405,0.0,2.702703,2.702703,...,12671.0,14197.307692,13749.0,14607.0,1886.076923,1547.0,2255.0,13.238462,11.1,15.6
2,"Antrim County, MI",2010,50.0,50.0,0.0,3.571429,7.142857,0.0,3.571429,3.571429,...,9863.0,10511.153846,9765.0,11515.0,1697.307692,1390.0,2076.0,16.184615,13.6,20.1
3,"Arenac County, MI",2010,55.555556,44.444444,0.0,0.0,0.0,0.0,5.555556,11.111111,...,6152.0,6924.0,6521.0,7250.0,1154.307692,933.0,1465.0,16.684615,13.8,21.4
4,"Barry County, MI",2010,70.512821,29.487179,0.0,6.410256,2.564103,3.846154,3.846154,6.410256,...,26601.0,29200.0,28750.0,29780.0,3063.230769,2452.0,3769.0,10.484615,8.5,13.0


In [41]:
# Save merged dataset to CSV
merged.to_csv("michigan_health_bls_merged.csv", index=False)

print("✅ Exported merged dataset to michigan_health_bls_merged.csv")

✅ Exported merged dataset to michigan_health_bls_merged.csv
