In [1]:
import pandas as pd
import os

In [2]:
# Load the CSV file

df = pd.read_csv("bls_mi/Michigan_county_employment.csv")
print(df.head(5))
# Show the shape (rows, columns)
print("Shape of dataset:", df.shape)

# Show column names
print("Columns:", df.columns.tolist())

# Show data types and non-null counts
print("\nInfo:")
print(df.info())

# Show the first 5 rows
print("\nHead:")
display(df.head())

              county  year  month  unemployment_rate  labor_force  employment  \
0  Alcona County, MI  1990      1               16.6        636.0      3192.0   
1  Alcona County, MI  1990      2               16.2        611.0      3171.0   
2  Alcona County, MI  1990      3               15.8        608.0      3246.0   
3  Alcona County, MI  1990      4               14.7        579.0      3369.0   
4  Alcona County, MI  1990      5               11.7        468.0      3524.0   

   unemployment  
0        3828.0  
1        3782.0  
2        3854.0  
3        3948.0  
4        3992.0  
Shape of dataset: (37848, 7)
Columns: ['county', 'year', 'month', 'unemployment_rate', 'labor_force', 'employment', 'unemployment']

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37848 entries, 0 to 37847
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   county             37848 non-null  object 
 1   year     

Unnamed: 0,county,year,month,unemployment_rate,labor_force,employment,unemployment
0,"Alcona County, MI",1990,1,16.6,636.0,3192.0,3828.0
1,"Alcona County, MI",1990,2,16.2,611.0,3171.0,3782.0
2,"Alcona County, MI",1990,3,15.8,608.0,3246.0,3854.0
3,"Alcona County, MI",1990,4,14.7,579.0,3369.0,3948.0
4,"Alcona County, MI",1990,5,11.7,468.0,3524.0,3992.0


In [3]:
# Standardize column names
df = df.rename(columns={"county": "county_name", "year": "survey_year"})

# Collapse monthly values to yearly summaries
bls_summary = (
    df.groupby(["county_name", "survey_year"])
    .agg({
        "employment": ["mean", "min", "max"],
        "unemployment": ["mean", "min", "max"],
        "labor_force": ["mean", "min", "max"],
        "unemployment_rate": ["mean", "min", "max"]
    })
    .reset_index()
)

# Flatten hierarchical columns
bls_summary.columns = ["_".join(c).strip("_") for c in bls_summary.columns.values]

In [4]:
df.head()

Unnamed: 0,county_name,survey_year,month,unemployment_rate,labor_force,employment,unemployment
0,"Alcona County, MI",1990,1,16.6,636.0,3192.0,3828.0
1,"Alcona County, MI",1990,2,16.2,611.0,3171.0,3782.0
2,"Alcona County, MI",1990,3,15.8,608.0,3246.0,3854.0
3,"Alcona County, MI",1990,4,14.7,579.0,3369.0,3948.0
4,"Alcona County, MI",1990,5,11.7,468.0,3524.0,3992.0


In [5]:
bls_summary.head()

Unnamed: 0,county_name,survey_year,employment_mean,employment_min,employment_max,unemployment_mean,unemployment_min,unemployment_max,labor_force_mean,labor_force_min,labor_force_max,unemployment_rate_mean,unemployment_rate_min,unemployment_rate_max
0,"Alcona County, MI",1990,3420.538462,3171.0,3662.0,3933.923077,3782.0,4190.0,513.384615,382.0,636.0,13.084615,9.8,16.6
1,"Alcona County, MI",1991,3368.846154,3112.0,3643.0,3937.923077,3770.0,4183.0,569.076923,342.0,739.0,14.438462,9.0,19.1
2,"Alcona County, MI",1992,3473.307692,3242.0,3712.0,4138.692308,3985.0,4428.0,665.384615,493.0,805.0,16.092308,11.8,19.6
3,"Alcona County, MI",1993,3870.153846,3683.0,4118.0,4493.307692,4347.0,4721.0,623.153846,525.0,737.0,13.892308,11.4,16.7
4,"Alcona County, MI",1994,4276.846154,3845.0,4591.0,4896.538462,4659.0,5205.0,619.692308,446.0,814.0,12.723077,9.2,17.5


In [6]:
data_dir = "cdc_mi"

# Collect only health_core files
files = [f for f in os.listdir(data_dir) if f.endswith("health_core.csv")]

dfs = []
for file in files:
    file_path = os.path.join(data_dir, file)
    df = pd.read_csv(file_path)
    
    # ✅ Keep only California rows
    if "state_name" in df.columns:
        df = df[df["state_name"].str.contains("California", case=False, na=False)]
    elif "_state" in df.columns:
        df = df[df["_state"] == 6]  # California FIPS
    
    # ✅ Filter years 1993–2010
    if "survey_year" in df.columns:
        df = df[(df["survey_year"] >= 1993) & (df["survey_year"] <= 2010)]
    
    dfs.append(df)

# Combine all health_core years
health_core_data = pd.concat(dfs, ignore_index=True)

# ✅ Standardize county_name → "County, ST" format (matches BLS)
health_core_data["county_name"] = (
    health_core_data["county_name"].str.strip() + ", " +
    health_core_data["state_name"].str.strip().str[:2].str.upper()
)

print("Final shape:", health_core_data.shape)
print("Columns:", health_core_data.columns.tolist()[:12], "...")
display(health_core_data.head())

Final shape: (17320, 8)
Columns: ['county_name', 'state_name', 'survey_year', 'general_health_status', 'days_physical_health_not_good', 'days_mental_health_not_good', 'days_poor_health_limited_activities', 'smoked_100_cigarettes'] ...


Unnamed: 0,county_name,state_name,survey_year,general_health_status,days_physical_health_not_good,days_mental_health_not_good,days_poor_health_limited_activities,smoked_100_cigarettes
0,"Alameda County, CA",California,2010,2.0,88.0,25.0,88.0,2.0
1,"Alameda County, CA",California,2010,3.0,88.0,88.0,,2.0
2,"Alameda County, CA",California,2010,3.0,7.0,2.0,88.0,2.0
3,"Alameda County, CA",California,2010,3.0,15.0,30.0,3.0,2.0
4,"Alameda County, CA",California,2010,1.0,88.0,2.0,88.0,1.0


In [7]:
print("Earliest year:", health_core_data["survey_year"].min())
print("Latest year:", health_core_data["survey_year"].max())

Earliest year: 2010
Latest year: 2010


In [8]:
print("Columns:", health_core_data.columns.tolist())

Columns: ['county_name', 'state_name', 'survey_year', 'general_health_status', 'days_physical_health_not_good', 'days_mental_health_not_good', 'days_poor_health_limited_activities', 'smoked_100_cigarettes']


In [9]:
# Candidate categorical variables (confirmed from your dataset)
cat_vars = [
    "general_health_status",
    "ever_told_high_bp",
    "ever_told_diabetes",
    "smoked_100_cigarettes",
    "current_smoking_frequency",
    "any_alcohol_past_month",
    "eats_fruit",
    "eats_other_vegetables",
    "any_physical_activity"
]

percent_dfs = []

for col in cat_vars:
    # Count responses per category
    temp = (
        health_core_data
        .groupby(["county_name", "survey_year", col])
        .size()
        .reset_index(name="count")
    )
    
    # Normalize counts into percentages within each county × year
    temp["percent"] = (
        temp.groupby(["county_name", "survey_year"])["count"]
        .transform(lambda x: 100 * x / x.sum())
    )
    
    # Tag variable name and unify structure
    temp["variable"] = col
    temp.rename(columns={col: "category"}, inplace=True)
    temp = temp[["county_name", "survey_year", "variable", "category", "percent"]]
    
    percent_dfs.append(temp)

# Combine all categorical breakdowns into one tidy dataframe
percent_breakdowns = pd.concat(percent_dfs, ignore_index=True)

print("Shape:", percent_breakdowns.shape)
display(percent_breakdowns.head(20))

KeyError: 'ever_told_high_bp'

In [138]:
cdc_wide = (
    percent_breakdowns
    .pivot_table(
        index=["county_name", "survey_year"],
        columns=["variable", "category"],
        values="percent",
        fill_value=0
    )
)

# Flatten MultiIndex → clean column names
cdc_wide.columns = [
    f"{var}_{int(cat) if isinstance(cat, (int, float)) and float(cat).is_integer() else cat}"
    for var, cat in cdc_wide.columns
]

cdc_wide = cdc_wide.reset_index()

print("Final shape:", cdc_wide.shape)
display(cdc_wide.head())

Final shape: (492, 225)


Unnamed: 0,county_name,survey_year,any_alcohol_past_month_1,any_alcohol_past_month_2,any_alcohol_past_month_7,any_alcohol_past_month_9,any_physical_activity_1,any_physical_activity_2,any_physical_activity_7,current_smoking_frequency_1,...,general_health_status_2,general_health_status_3,general_health_status_4,general_health_status_5,general_health_status_7,general_health_status_9,smoked_100_cigarettes_1,smoked_100_cigarettes_2,smoked_100_cigarettes_7,smoked_100_cigarettes_9
0,"Alameda County, CA",1993,62.318841,36.956522,0.724638,0.0,0.0,0.0,0.0,0.0,...,38.405797,24.637681,7.971014,4.347826,0.0,0.0,41.304348,58.695652,0.0,0.0
1,"Alameda County, CA",1994,0.0,0.0,0.0,0.0,78.698225,21.301775,0.0,0.0,...,36.094675,31.360947,10.650888,3.550296,0.591716,0.0,46.745562,53.254438,0.0,0.0
2,"Alameda County, CA",1995,68.27957,31.72043,0.0,0.0,81.182796,18.817204,0.0,0.0,...,36.55914,32.258065,6.451613,2.688172,0.0,0.0,47.849462,51.075269,1.075269,0.0
3,"Alameda County, CA",1996,63.190184,36.809816,0.0,0.0,84.04908,15.95092,0.0,29.487179,...,34.355828,29.447853,4.907975,3.067485,0.0,0.0,47.852761,52.147239,0.0,0.0
4,"Alameda County, CA",1997,73.053892,26.946108,0.0,0.0,0.0,0.0,0.0,21.794872,...,43.113772,22.155689,7.784431,1.197605,0.0,0.0,46.706587,53.293413,0.0,0.0


In [144]:
# Define category label dictionary
codebook_map = {
    "general_health_status": {
        1: "Excellent", 2: "Very good", 3: "Good", 4: "Fair", 5: "Poor",
        7: "Don’t know/Not sure", 9: "Refused"
    },
    "ever_told_high_bp": {
        1: "Yes", 2: "No", 7: "Don’t know/Not sure", 9: "Refused"
    },
    "ever_told_diabetes": {
        1: "Yes", 2: "No", 3: "Yes, during pregnancy", 4: "Pre-diabetes/Borderline",
        7: "Don’t know/Not sure", 9: "Refused"
    },
    "smoked_100_cigarettes": {
        1: "Yes", 2: "No", 7: "Don’t know/Not sure", 9: "Refused"
    },
    "current_smoking_frequency": {
        1: "Every day", 2: "Some days", 3: "Not at all",
        7: "Don’t know/Not sure", 9: "Refused"
    },
    "any_alcohol_past_month": {
        1: "Yes", 2: "No", 7: "Don’t know/Not sure", 9: "Refused"
    },
    "eats_fruit": {
        1: "Yes", 2: "No", 7: "Don’t know/Not sure", 9: "Refused"
    },
    "eats_other_vegetables": {
        1: "Yes", 2: "No", 7: "Don’t know/Not sure", 9: "Refused"
    },
    "any_physical_activity": {
        1: "Yes", 2: "No", 7: "Don’t know/Not sure", 9: "Refused"
    }
}

# Special decoder for frequency codes (fruit/veg etc.)
def decode_food_code(val):
    if val in [1, 2, 7, 9]:
        return {1: "Yes", 2: "No", 7: "Don’t know/Not sure", 9: "Refused"}[val]
    elif 101 <= val <= 199:
        return f"{val - 100} times/day"
    elif 201 <= val <= 299:
        return f"{val - 200} times/week"
    elif 301 <= val <= 399:
        return f"{val - 300} times/month"
    else:
        return f"Code {val}"

# Build rename mapping
rename_map = {}
for col in cdc_wide.columns:
    if col in ["county_name", "survey_year"]:
        continue
    var, cat = col.rsplit("_", 1)
    try:
        cat_int = int(cat)
    except:
        cat_int = cat
    
    if var in codebook_map and cat_int in codebook_map[var]:
        rename_map[col] = f"{var} - {codebook_map[var][cat_int]}"
    elif var in ["eats_fruit", "eats_other_vegetables"]:
        rename_map[col] = f"{var} - {decode_food_code(cat_int)}"
    else:
        rename_map[col] = col

In [146]:
cdc_wide = cdc_wide.rename(columns=rename_map)

print("Final shape:", cdc_wide.shape)
display(cdc_wide.head())

Final shape: (492, 225)


Unnamed: 0,county_name,survey_year,any_alcohol_past_month - Yes,any_alcohol_past_month - No,any_alcohol_past_month - Don’t know/Not sure,any_alcohol_past_month - Refused,any_physical_activity - Yes,any_physical_activity - No,any_physical_activity - Don’t know/Not sure,current_smoking_frequency - Every day,...,general_health_status - Very good,general_health_status - Good,general_health_status - Fair,general_health_status - Poor,general_health_status - Don’t know/Not sure,general_health_status - Refused,smoked_100_cigarettes - Yes,smoked_100_cigarettes - No,smoked_100_cigarettes - Don’t know/Not sure,smoked_100_cigarettes - Refused
0,"Alameda County, CA",1993,62.318841,36.956522,0.724638,0.0,0.0,0.0,0.0,0.0,...,38.405797,24.637681,7.971014,4.347826,0.0,0.0,41.304348,58.695652,0.0,0.0
1,"Alameda County, CA",1994,0.0,0.0,0.0,0.0,78.698225,21.301775,0.0,0.0,...,36.094675,31.360947,10.650888,3.550296,0.591716,0.0,46.745562,53.254438,0.0,0.0
2,"Alameda County, CA",1995,68.27957,31.72043,0.0,0.0,81.182796,18.817204,0.0,0.0,...,36.55914,32.258065,6.451613,2.688172,0.0,0.0,47.849462,51.075269,1.075269,0.0
3,"Alameda County, CA",1996,63.190184,36.809816,0.0,0.0,84.04908,15.95092,0.0,29.487179,...,34.355828,29.447853,4.907975,3.067485,0.0,0.0,47.852761,52.147239,0.0,0.0
4,"Alameda County, CA",1997,73.053892,26.946108,0.0,0.0,0.0,0.0,0.0,21.794872,...,43.113772,22.155689,7.784431,1.197605,0.0,0.0,46.706587,53.293413,0.0,0.0


In [148]:
# Preview first 20 mappings
for k, v in list(col_labels.items())[:20]:
    print(f"{k} → {v}")

county_name → county_name
survey_year → survey_year
any_alcohol_past_month_1 → % Yes (any_alcohol_past_month)
any_alcohol_past_month_2 → % No (any_alcohol_past_month)
any_alcohol_past_month_7 → % Don’t know/Not sure (any_alcohol_past_month)
any_alcohol_past_month_9 → % Refused (any_alcohol_past_month)
any_physical_activity_1 → % Yes (any_physical_activity)
any_physical_activity_2 → % No (any_physical_activity)
any_physical_activity_7 → % Don’t know/Not sure (any_physical_activity)
current_smoking_frequency_1 → % Every day (current_smoking_frequency)
current_smoking_frequency_2 → % Some days (current_smoking_frequency)
current_smoking_frequency_3 → % Not at all (current_smoking_frequency)
current_smoking_frequency_9 → % Refused (current_smoking_frequency)
eats_fruit_101 → eats_fruit_101
eats_fruit_102 → eats_fruit_102
eats_fruit_103 → eats_fruit_103
eats_fruit_104 → eats_fruit_104
eats_fruit_105 → eats_fruit_105
eats_fruit_106 → eats_fruit_106
eats_fruit_107 → eats_fruit_107
