In [32]:
# ===============================
# 1. Setup and Load BLS Data
# ===============================

import pandas as pd
import os

# Load the BLS CSV file
df = pd.read_csv("bls_mi/Michigan_county_employment.csv")

print("Shape of dataset:", df.shape)
print("Columns:", df.columns.tolist())
print("\nInfo:")
print(df.info())
display(df.head())

# Standardize column names
df = df.rename(columns={"county": "county_name", "year": "survey_year"})

# Collapse monthly values to yearly summaries
bls_summary = (
    df.groupby(["county_name", "survey_year"])
    .agg({
        "employment": ["mean", "min", "max"],
        "unemployment": ["mean", "min", "max"],
        "labor_force": ["mean", "min", "max"],
        "unemployment_rate": ["mean", "min", "max"]
    })
    .reset_index()
)

# Flatten hierarchical columns
bls_summary.columns = ["_".join(c).strip("_") for c in bls_summary.columns.values]

print("BLS summary shape:", bls_summary.shape)
print(bls_summary.columns)
allegan = bls_summary[bls_summary['county_name'] == 'Allegan County, MI']
display(allegan)

Shape of dataset: (37848, 7)
Columns: ['county', 'year', 'month', 'unemployment_rate', 'labor_force', 'employment', 'unemployment']

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37848 entries, 0 to 37847
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   county             37848 non-null  object 
 1   year               37848 non-null  int64  
 2   month              37848 non-null  int64  
 3   unemployment_rate  37848 non-null  float64
 4   labor_force        37848 non-null  float64
 5   employment         37848 non-null  float64
 6   unemployment       37848 non-null  float64
dtypes: float64(4), int64(2), object(1)
memory usage: 2.0+ MB
None


Unnamed: 0,county,year,month,unemployment_rate,labor_force,employment,unemployment
0,"Alcona County, MI",1990,1,16.6,636.0,3192.0,3828.0
1,"Alcona County, MI",1990,2,16.2,611.0,3171.0,3782.0
2,"Alcona County, MI",1990,3,15.8,608.0,3246.0,3854.0
3,"Alcona County, MI",1990,4,14.7,579.0,3369.0,3948.0
4,"Alcona County, MI",1990,5,11.7,468.0,3524.0,3992.0


BLS summary shape: (2988, 14)
Index(['county_name', 'survey_year', 'employment_mean', 'employment_min',
       'employment_max', 'unemployment_mean', 'unemployment_min',
       'unemployment_max', 'labor_force_mean', 'labor_force_min',
       'labor_force_max', 'unemployment_rate_mean', 'unemployment_rate_min',
       'unemployment_rate_max'],
      dtype='object')


Unnamed: 0,county_name,survey_year,employment_mean,employment_min,employment_max,unemployment_mean,unemployment_min,unemployment_max,labor_force_mean,labor_force_min,labor_force_max,unemployment_rate_mean,unemployment_rate_min,unemployment_rate_max
72,"Allegan County, MI",1990,43189.692308,42306.0,44049.0,45989.307692,45390.0,46926.0,2799.615385,2480.0,3084.0,6.1,5.3,6.8
73,"Allegan County, MI",1991,42206.153846,41596.0,42727.0,45397.461538,45013.0,45963.0,3191.307692,2689.0,3893.0,7.007692,6.0,8.5
74,"Allegan County, MI",1992,43001.076923,41364.0,44003.0,46239.384615,45268.0,47053.0,3238.307692,2472.0,4118.0,7.007692,5.4,9.0
75,"Allegan County, MI",1993,45664.153846,43474.0,47167.0,48436.076923,46928.0,49708.0,2771.923077,2154.0,3503.0,5.746154,4.4,7.4
76,"Allegan County, MI",1994,48434.538462,46560.0,49838.0,50684.076923,49869.0,51427.0,2249.538462,1577.0,3309.0,4.438462,3.1,6.6
77,"Allegan County, MI",1995,49654.615385,48704.0,50249.0,51614.846154,51094.0,52231.0,1960.230769,1619.0,2390.0,3.807692,3.1,4.7
78,"Allegan County, MI",1996,50862.230769,49004.0,52280.0,52748.230769,51559.0,53779.0,1886.0,1272.0,2555.0,3.592308,2.4,5.0
79,"Allegan County, MI",1997,53459.153846,51654.0,54570.0,55118.692308,53834.0,56108.0,1659.538462,1225.0,2195.0,3.0,2.2,4.0
80,"Allegan County, MI",1998,54737.076923,53658.0,55581.0,56281.307692,55613.0,56958.0,1544.230769,1115.0,2104.0,2.746154,2.0,3.7
81,"Allegan County, MI",1999,56429.846154,55191.0,57710.0,57969.076923,57088.0,59107.0,1539.230769,1175.0,2021.0,2.661538,2.0,3.5


In [37]:
# ===============================
# 2. Load and Clean CDC Health Core Data
# ===============================

data_dir = "cdc_mi"
files = [f for f in os.listdir(data_dir) if f.endswith(".csv")]

dfs = []
for file in files:
    file_path = os.path.join(data_dir, file)
    df = pd.read_csv(file_path)
    
    # Keep only California rows
    if "state_name" in df.columns:
        df = df[df["state_name"].str.contains("Michigan", case=False, na=False)]
    elif "_state" in df.columns:
        df = df[df["_state"] == 26]  # Michigan Fips
    
    # Restrict years
    if "survey_year" in df.columns:
        df = df[(df["survey_year"] >= 1993) & (df["survey_year"] <= 2010)]
    
    dfs.append(df)

# Combine all years
health_core_data = pd.concat(dfs, ignore_index=True)

# Standardize county_name → "County, ST" format
health_core_data["county_name"] = (
    health_core_data["county_name"].str.strip() + ", " +
    health_core_data["state_name"].str.strip().str[:2].str.upper()
)

print("Health core shape:", health_core_data.shape)
print("Columns:", health_core_data.columns.tolist()[:12], "...")
print("Earliest year:", health_core_data["survey_year"].min())
print("Latest year:", health_core_data["survey_year"].max())

Health core shape: (301576, 67)
Columns: ['county_name', 'state_name', 'survey_year', 'respondent_age', 'respondent_sex', 'self_reported_race', 'hispanic_ethnicity', 'marital_status', 'education_level', 'employment_status', 'num_adults_in_household', 'num_men_in_household'] ...
Earliest year: 1993
Latest year: 2010


Unnamed: 0,county_name,state_name,survey_year,respondent_age,respondent_sex,self_reported_race,hispanic_ethnicity,marital_status,education_level,employment_status,...,ever_told_asthma,ever_told_heart_attack,ever_told_coronary_heart_disease,ever_told_stroke,ever_told_arthritis,ever_had_sigmoidoscopy_colonoscopy,high_blood_pressure_flag,tobacco_use_flag,obesity_flag,overweight_or_obese_flag


In [64]:
print("Columns for analysis:", health_core_data.columns.tolist()[:12], "...")

allegansub = health_core_data['county_name'].unique()

display(allegansub)

Columns for analysis: ['county_name', 'state_name', 'survey_year', 'respondent_age', 'respondent_sex', 'self_reported_race', 'hispanic_ethnicity', 'marital_status', 'education_level', 'employment_status', 'num_adults_in_household', 'num_men_in_household'] ...


array(['Genesee County, MI', 'Ingham County, MI', 'Kent County, MI',
       'Macomb County, MI', 'Oakland County, MI', 'St. Clair County, MI',
       'Washtenaw County, MI', 'Wayne County, MI', 'Berrien County, MI',
       'Kalamazoo County, MI', 'Ottawa County, MI', 'Muskegon County, MI',
       'Saginaw County, MI', 'Van Buren County, MI',
       'Grand Traverse County, MI', 'Roscommon County, MI',
       'Lenawee County, MI', 'Leelanau County, MI',
       'Cheboygan County, MI', 'Huron County, MI', 'Alpena County, MI',
       'Lapeer County, MI', 'Eaton County, MI', 'Hillsdale County, MI',
       'Gratiot County, MI', 'Kalkaska County, MI', 'Clare County, MI',
       'Montcalm County, MI', 'Allegan County, MI', 'Branch County, MI',
       'Iosco County, MI', 'Tuscola County, MI', 'Antrim County, MI',
       'Gladwin County, MI', 'Wexford County, MI', 'Ogemaw County, MI',
       'Benzie County, MI', 'Monroe County, MI', 'Emmet County, MI',
       'Clinton County, MI', 'Cass County, M

In [20]:
# ===============================
# 3. Categorical Percent Breakdowns
# ===============================

cat_vars_mi = [
    "general_health_status",
    "smoked_100_cigarettes",
    "eats_fruit",
    "eats_other_vegetables",
    "cholesterol_checked_5yr",
    "currently_has_asthma",
    "ever_had_mammogram",
    "teeth_cleaned_by_dentist",
    "current_smoking_frequency",
    "last_dentist_visit",
    "smoking_status_recode",
    "ever_told_diabetes",
    "flu_shot_past_year",
    "ever_had_pap_smear",
    "ever_told_high_bp",
    "any_alcohol_past_month",
    "any_physical_activity",
    "ever_told_high_cholesterol",
    "ever_told_asthma",
    "ever_told_heart_attack",
    "ever_told_coronary_heart_disease",
    "ever_told_stroke",
    "ever_told_arthritis",
    "ever_had_sigmoidoscopy_colonoscopy",
    "high_blood_pressure_flag",
    "tobacco_use_flag",
    "obesity_flag",
    "overweight_or_obese_flag",
]
percent_dfs = []

for col in cat_vars_mi:
    temp = (
        health_core_data
        .groupby(["county_name", "survey_year", col])
        .size()
        .reset_index(name="count")
    )
    
    temp["percent"] = (
        temp.groupby(["county_name", "survey_year"])["count"]
        .transform(lambda x: 100 * x / x.sum())
    )
    
    temp["variable"] = col
    temp.rename(columns={col: "category"}, inplace=True)
    temp = temp[["county_name", "survey_year", "variable", "category", "percent"]]
    
    percent_dfs.append(temp)

percent_breakdowns = pd.concat(percent_dfs, ignore_index=True)

print("Percent breakdowns shape:", percent_breakdowns.shape)
display(percent_breakdowns.head(20))
display(percent_breakdowns['variable'].unique())

Percent breakdowns shape: (20265, 5)


Unnamed: 0,county_name,survey_year,variable,category,percent
0,"Allegan County, MI",2002,general_health_status,1.0,20.895522
1,"Allegan County, MI",2002,general_health_status,2.0,37.313433
2,"Allegan County, MI",2002,general_health_status,3.0,29.850746
3,"Allegan County, MI",2002,general_health_status,4.0,2.985075
4,"Allegan County, MI",2002,general_health_status,5.0,7.462687
5,"Allegan County, MI",2002,general_health_status,9.0,1.492537
6,"Allegan County, MI",2004,general_health_status,1.0,15.384615
7,"Allegan County, MI",2004,general_health_status,2.0,30.769231
8,"Allegan County, MI",2004,general_health_status,3.0,38.461538
9,"Allegan County, MI",2004,general_health_status,4.0,12.307692


array(['general_health_status', 'smoked_100_cigarettes', 'eats_fruit',
       'eats_other_vegetables', 'cholesterol_checked_5yr',
       'currently_has_asthma', 'ever_had_mammogram',
       'teeth_cleaned_by_dentist', 'current_smoking_frequency',
       'last_dentist_visit', 'smoking_status_recode',
       'ever_told_diabetes', 'flu_shot_past_year', 'ever_had_pap_smear',
       'ever_told_high_bp', 'any_alcohol_past_month',
       'any_physical_activity', 'ever_told_high_cholesterol',
       'ever_told_asthma', 'ever_told_arthritis',
       'ever_had_sigmoidoscopy_colonoscopy', 'high_blood_pressure_flag',
       'tobacco_use_flag', 'obesity_flag', 'overweight_or_obese_flag'],
      dtype=object)

In [22]:
# ===============================
# 4. Pivot CDC Data to Wide Format
# ===============================

cdc_wide = (
    percent_breakdowns
    .pivot_table(
        index=["county_name", "survey_year"],
        columns=["variable", "category"],
        values="percent",
        fill_value=0
    )
)

cdc_wide.columns = [
    f"{var}_{int(cat) if isinstance(cat, (int, float)) and float(cat).is_integer() else cat}"
    for var, cat in cdc_wide.columns
]

cdc_wide = cdc_wide.reset_index()

print("CDC wide shape:", cdc_wide.shape)
print(cdc_wide.dropna(how='all'))
display(cdc_wide.head())

CDC wide shape: (521, 280)
            county_name  survey_year  any_alcohol_past_month_1  \
0    Allegan County, MI         2002                       0.0   
1    Allegan County, MI         2004                       0.0   
2    Allegan County, MI         2005                       0.0   
3    Allegan County, MI         2006                       0.0   
4    Allegan County, MI         2007                       0.0   
..                  ...          ...                       ...   
516  Wexford County, MI         2005                       0.0   
517  Wexford County, MI         2007                       0.0   
518  Wexford County, MI         2008                       0.0   
519  Wexford County, MI         2009                       0.0   
520  Wexford County, MI         2010                       0.0   

     any_alcohol_past_month_2  any_alcohol_past_month_7  \
0                         0.0                       0.0   
1                         0.0                       0.0   
2  

Unnamed: 0,county_name,survey_year,any_alcohol_past_month_1,any_alcohol_past_month_2,any_alcohol_past_month_7,any_physical_activity_1,any_physical_activity_2,any_physical_activity_7,cholesterol_checked_5yr_1,cholesterol_checked_5yr_2,...,smoking_status_recode_6,smoking_status_recode_9,teeth_cleaned_by_dentist_1,teeth_cleaned_by_dentist_2,teeth_cleaned_by_dentist_3,teeth_cleaned_by_dentist_4,teeth_cleaned_by_dentist_7,teeth_cleaned_by_dentist_8,teeth_cleaned_by_dentist_9,tobacco_use_flag_9
0,"Allegan County, MI",2002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,84.126984,6.349206,3.174603,4.761905,1.587302,0.0,0.0,0.0
1,"Allegan County, MI",2004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,71.666667,10.0,8.333333,10.0,0.0,0.0,0.0,0.0
2,"Allegan County, MI",2005,0.0,0.0,0.0,0.0,0.0,0.0,68.333333,18.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Allegan County, MI",2006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,73.846154,6.153846,6.153846,10.769231,0.0,3.076923,0.0,0.0
4,"Allegan County, MI",2007,0.0,0.0,0.0,0.0,0.0,0.0,73.469388,12.244898,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
# ===============================
# 5. Apply Readable Labels (Codebook)
# ===============================

codebook_map = {
    "general_health_status": {
        1: "Excellent", 2: "Very good", 3: "Good", 4: "Fair", 5: "Poor",
        7: "Don’t know/Not sure", 9: "Refused"
    },
    "ever_told_high_bp": {
        1: "Yes", 2: "No", 7: "Don’t know/Not sure", 9: "Refused"
    },
    "ever_told_diabetes": {
        1: "Yes", 2: "No", 3: "Yes, during pregnancy", 4: "Pre-diabetes/Borderline",
        7: "Don’t know/Not sure", 9: "Refused"
    },
    "smoked_100_cigarettes": {
        1: "Yes", 2: "No", 7: "Don’t know/Not sure", 9: "Refused"
    },
    "current_smoking_frequency": {
        1: "Every day", 2: "Some days", 3: "Not at all",
        7: "Don’t know/Not sure", 9: "Refused"
    },
    "any_alcohol_past_month": {
        1: "Yes", 2: "No", 7: "Don’t know/Not sure", 9: "Refused"
    },
    "eats_fruit": {
        1: "Yes", 2: "No", 7: "Don’t know/Not sure", 9: "Refused"
    },
    "eats_other_vegetables": {
        1: "Yes", 2: "No", 7: "Don’t know/Not sure", 9: "Refused"
    },
    "any_physical_activity": {
        1: "Yes", 2: "No", 7: "Don’t know/Not sure", 9: "Refused"
    }
}

def decode_food_code(val):
    if val in [1, 2, 7, 9]:
        return {1: "Yes", 2: "No", 7: "Don’t know/Not sure", 9: "Refused"}[val]
    elif 101 <= val <= 199:
        return f"{val - 100} times/day"
    elif 201 <= val <= 299:
        return f"{val - 200} times/week"
    elif 301 <= val <= 399:
        return f"{val - 300} times/month"
    else:
        return f"Code {val}"

rename_map = {}
for col in cdc_wide.columns:
    if col in ["county_name", "survey_year"]:
        continue
    var, cat = col.rsplit("_", 1)
    try:
        cat_int = int(cat)
    except:
        cat_int = cat
    
    if var in codebook_map and cat_int in codebook_map[var]:
        rename_map[col] = f"{var} - {codebook_map[var][cat_int]}"
    elif var in ["eats_fruit", "eats_other_vegetables"]:
        rename_map[col] = f"{var} - {decode_food_code(cat_int)}"
    else:
        rename_map[col] = col

cdc_wide = cdc_wide.rename(columns=rename_map)

print("CDC wide with labels shape:", cdc_wide.shape)
display(cdc_wide.head())

CDC wide with labels shape: (521, 280)


Unnamed: 0,county_name,survey_year,any_alcohol_past_month - Yes,any_alcohol_past_month - No,any_alcohol_past_month - Don’t know/Not sure,any_physical_activity - Yes,any_physical_activity - No,any_physical_activity - Don’t know/Not sure,cholesterol_checked_5yr_1,cholesterol_checked_5yr_2,...,smoking_status_recode_6,smoking_status_recode_9,teeth_cleaned_by_dentist_1,teeth_cleaned_by_dentist_2,teeth_cleaned_by_dentist_3,teeth_cleaned_by_dentist_4,teeth_cleaned_by_dentist_7,teeth_cleaned_by_dentist_8,teeth_cleaned_by_dentist_9,tobacco_use_flag_9
0,"Allegan County, MI",2002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,84.126984,6.349206,3.174603,4.761905,1.587302,0.0,0.0,0.0
1,"Allegan County, MI",2004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,71.666667,10.0,8.333333,10.0,0.0,0.0,0.0,0.0
2,"Allegan County, MI",2005,0.0,0.0,0.0,0.0,0.0,0.0,68.333333,18.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Allegan County, MI",2006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,73.846154,6.153846,6.153846,10.769231,0.0,3.076923,0.0,0.0
4,"Allegan County, MI",2007,0.0,0.0,0.0,0.0,0.0,0.0,73.469388,12.244898,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
# ===============================
# 6. Merge CDC Health and BLS Labor Data
# ===============================

merged = pd.merge(
    cdc_wide,
    bls_summary,
    on=["county_name", "survey_year"],
    how="inner"   # inner join: only keep counties/years that appear in both
)

print("Final merged shape:", merged.shape)
display(merged.head())

Final merged shape: (521, 292)


Unnamed: 0,county_name,survey_year,any_alcohol_past_month - Yes,any_alcohol_past_month - No,any_alcohol_past_month - Don’t know/Not sure,any_physical_activity - Yes,any_physical_activity - No,any_physical_activity - Don’t know/Not sure,cholesterol_checked_5yr_1,cholesterol_checked_5yr_2,...,employment_max,unemployment_mean,unemployment_min,unemployment_max,labor_force_mean,labor_force_min,labor_force_max,unemployment_rate_mean,unemployment_rate_min,unemployment_rate_max
0,"Allegan County, MI",2002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,53391.0,55116.384615,53223.0,56804.0,3098.615385,2456.0,3558.0,5.623077,4.5,6.5
1,"Allegan County, MI",2004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,51389.0,53465.230769,52608.0,55016.0,3430.846154,2994.0,3857.0,6.430769,5.7,7.3
2,"Allegan County, MI",2005,0.0,0.0,0.0,0.0,0.0,0.0,68.333333,18.333333,...,53006.0,55153.0,53297.0,56522.0,3445.384615,2929.0,4086.0,6.246154,5.3,7.6
3,"Allegan County, MI",2006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,54224.0,56370.076923,55270.0,57793.0,3468.076923,2892.0,3883.0,6.161538,5.2,7.0
4,"Allegan County, MI",2007,0.0,0.0,0.0,0.0,0.0,0.0,73.469388,12.244898,...,53748.0,56008.615385,55096.0,57299.0,3568.384615,3119.0,3989.0,6.376923,5.6,7.2


In [48]:
# Save merged dataset to CSV
merged.to_csv("michigan_health_bls_merged.csv", index=False)

print("✅ Exported merged dataset to michigan_health_bls_merged.csv")

✅ Exported merged dataset to michigan_health_bls_merged.csv
