In [1]:
# ===============================
# 1. Setup and Load BLS Data
# ===============================

import pandas as pd
import os

# Load the BLS CSV file
df = pd.read_csv("bls_california/california_county_employment.csv")

print("Shape of dataset:", df.shape)
print("Columns:", df.columns.tolist())
print("\nInfo:")
print(df.info())
display(df.head())

# Standardize column names
df = df.rename(columns={"county": "county_name", "year": "survey_year"})

# Collapse monthly values to yearly summaries
bls_summary = (
    df.groupby(["county_name", "survey_year"])
    .agg({
        "employment": ["mean", "min", "max"],
        "unemployment": ["mean", "min", "max"],
        "labor_force": ["mean", "min", "max"],
        "unemployment_rate": ["mean", "min", "max"]
    })
    .reset_index()
)

# Flatten hierarchical columns
bls_summary.columns = ["_".join(c).strip("_") for c in bls_summary.columns.values]

print("BLS summary shape:", bls_summary.shape)
display(bls_summary.head())

Shape of dataset: (26449, 7)
Columns: ['county', 'year', 'month', 'employment', 'unemployment', 'labor_force', 'unemployment_rate']

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26449 entries, 0 to 26448
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   county             26449 non-null  object 
 1   year               26449 non-null  int64  
 2   month              26449 non-null  int64  
 3   employment         26449 non-null  float64
 4   unemployment       26449 non-null  float64
 5   labor_force        26449 non-null  float64
 6   unemployment_rate  26449 non-null  float64
dtypes: float64(4), int64(2), object(1)
memory usage: 1.4+ MB
None


Unnamed: 0,county,year,month,employment,unemployment,labor_force,unemployment_rate
0,"Alameda County, CA",1990,1,657058.0,683503.0,26445.0,3.9
1,"Alameda County, CA",1990,2,650679.0,676039.0,25360.0,3.8
2,"Alameda County, CA",1990,3,649735.0,673971.0,24236.0,3.6
3,"Alameda County, CA",1990,4,645659.0,671250.0,25591.0,3.8
4,"Alameda County, CA",1990,5,646698.0,671949.0,25251.0,3.8


BLS summary shape: (2088, 14)


Unnamed: 0,county_name,survey_year,employment_mean,employment_min,employment_max,unemployment_mean,unemployment_min,unemployment_max,labor_force_mean,labor_force_min,labor_force_max,unemployment_rate_mean,unemployment_rate_min,unemployment_rate_max
0,"Alameda County, CA",1990,650569.538462,645537.0,657058.0,678021.153846,671250.0,683547.0,27451.615385,24236.0,30648.0,4.053846,3.6,4.5
1,"Alameda County, CA",1991,635627.307692,625365.0,641120.0,672547.230769,663918.0,682083.0,36919.923077,34344.0,40963.0,5.492308,5.1,6.0
2,"Alameda County, CA",1992,632270.230769,628172.0,638912.0,677020.615385,670513.0,681892.0,44750.384615,41567.0,49198.0,6.607692,6.1,7.2
3,"Alameda County, CA",1993,632021.076923,624134.0,639235.0,677319.923077,669864.0,681111.0,45298.846154,39404.0,49620.0,6.676923,5.8,7.3
4,"Alameda County, CA",1994,636908.769231,633433.0,643742.0,678967.769231,675462.0,684694.0,42059.0,32608.0,46901.0,6.192308,4.8,6.9


In [3]:
# ===============================
# 2. Load and Clean CDC Health Core Data
# ===============================

data_dir = "cdc_california"
files = [f for f in os.listdir(data_dir) if f.endswith("health_core.csv")]

dfs = []
for file in files:
    file_path = os.path.join(data_dir, file)
    df = pd.read_csv(file_path)
    
    # Keep only California rows
    if "state_name" in df.columns:
        df = df[df["state_name"].str.contains("California", case=False, na=False)]
    elif "_state" in df.columns:
        df = df[df["_state"] == 6]  # California FIPS
    
    # Restrict years
    if "survey_year" in df.columns:
        df = df[(df["survey_year"] >= 1993) & (df["survey_year"] <= 2010)]
    
    dfs.append(df)

# Combine all years
health_core_data = pd.concat(dfs, ignore_index=True)

# Standardize county_name → "County, ST" format
health_core_data["county_name"] = (
    health_core_data["county_name"].str.strip() + ", " +
    health_core_data["state_name"].str.strip().str[:2].str.upper()
)

print("Health core shape:", health_core_data.shape)
print("Columns:", health_core_data.columns.tolist()[:12], "...")
print("Earliest year:", health_core_data["survey_year"].min())
print("Latest year:", health_core_data["survey_year"].max())
display(health_core_data.head())

Health core shape: (103680, 19)
Columns: ['county_name', 'state_name', 'survey_year', 'general_health_status', 'days_physical_health_not_good', 'days_mental_health_not_good', 'days_poor_health_limited_activities', 'ever_told_high_bp', 'ever_told_diabetes', 'smoked_100_cigarettes', 'current_smoking_frequency', 'any_alcohol_past_month'] ...
Earliest year: 1993
Latest year: 2010


Unnamed: 0,county_name,state_name,survey_year,general_health_status,days_physical_health_not_good,days_mental_health_not_good,days_poor_health_limited_activities,ever_told_high_bp,ever_told_diabetes,smoked_100_cigarettes,current_smoking_frequency,any_alcohol_past_month,avg_drinks_per_week,body_mass_index,respondent_weight_pounds,respondent_height_inches,eats_fruit,eats_other_vegetables,any_physical_activity
0,"Alameda County, CA",California,1996,2.0,88.0,88.0,,,3.0,1.0,3.0,2.0,999.0,217.0,138.0,507.0,203.0,106.0,1.0
1,"Alameda County, CA",California,1996,2.0,3.0,88.0,88.0,,3.0,1.0,3.0,1.0,999.0,204.0,115.0,503.0,101.0,101.0,1.0
2,"Alameda County, CA",California,1996,2.0,88.0,88.0,,,3.0,2.0,,1.0,999.0,252.0,175.0,510.0,203.0,207.0,2.0
3,"Alameda County, CA",California,1996,2.0,88.0,88.0,,,3.0,1.0,3.0,1.0,999.0,250.0,150.0,505.0,101.0,102.0,1.0
4,"Alameda County, CA",California,1996,2.0,88.0,88.0,,,3.0,2.0,,1.0,999.0,270.0,138.0,500.0,101.0,101.0,1.0


In [5]:
# ===============================
# 3. Categorical Percent Breakdowns
# ===============================

cat_vars = [
    "general_health_status",
    "ever_told_high_bp",
    "ever_told_diabetes",
    "smoked_100_cigarettes",
    "current_smoking_frequency",
    "any_alcohol_past_month",
    "eats_fruit",
    "eats_other_vegetables",
    "any_physical_activity"
]

percent_dfs = []

for col in cat_vars:
    temp = (
        health_core_data
        .groupby(["county_name", "survey_year", col])
        .size()
        .reset_index(name="count")
    )
    
    temp["percent"] = (
        temp.groupby(["county_name", "survey_year"])["count"]
        .transform(lambda x: 100 * x / x.sum())
    )
    
    temp["variable"] = col
    temp.rename(columns={col: "category"}, inplace=True)
    temp = temp[["county_name", "survey_year", "variable", "category", "percent"]]
    
    percent_dfs.append(temp)

percent_breakdowns = pd.concat(percent_dfs, ignore_index=True)

print("Percent breakdowns shape:", percent_breakdowns.shape)
display(percent_breakdowns.head(20))

Percent breakdowns shape: (14851, 5)


Unnamed: 0,county_name,survey_year,variable,category,percent
0,"Alameda County, CA",1993,general_health_status,1.0,24.637681
1,"Alameda County, CA",1993,general_health_status,2.0,38.405797
2,"Alameda County, CA",1993,general_health_status,3.0,24.637681
3,"Alameda County, CA",1993,general_health_status,4.0,7.971014
4,"Alameda County, CA",1993,general_health_status,5.0,4.347826
5,"Alameda County, CA",1994,general_health_status,1.0,17.751479
6,"Alameda County, CA",1994,general_health_status,2.0,36.094675
7,"Alameda County, CA",1994,general_health_status,3.0,31.360947
8,"Alameda County, CA",1994,general_health_status,4.0,10.650888
9,"Alameda County, CA",1994,general_health_status,5.0,3.550296


In [7]:
# ===============================
# 4. Pivot CDC Data to Wide Format
# ===============================

cdc_wide = (
    percent_breakdowns
    .pivot_table(
        index=["county_name", "survey_year"],
        columns=["variable", "category"],
        values="percent",
        fill_value=0
    )
)

cdc_wide.columns = [
    f"{var}_{int(cat) if isinstance(cat, (int, float)) and float(cat).is_integer() else cat}"
    for var, cat in cdc_wide.columns
]

cdc_wide = cdc_wide.reset_index()

print("CDC wide shape:", cdc_wide.shape)
display(cdc_wide.head())

CDC wide shape: (492, 225)


Unnamed: 0,county_name,survey_year,any_alcohol_past_month_1,any_alcohol_past_month_2,any_alcohol_past_month_7,any_alcohol_past_month_9,any_physical_activity_1,any_physical_activity_2,any_physical_activity_7,current_smoking_frequency_1,...,general_health_status_2,general_health_status_3,general_health_status_4,general_health_status_5,general_health_status_7,general_health_status_9,smoked_100_cigarettes_1,smoked_100_cigarettes_2,smoked_100_cigarettes_7,smoked_100_cigarettes_9
0,"Alameda County, CA",1993,62.318841,36.956522,0.724638,0.0,0.0,0.0,0.0,0.0,...,38.405797,24.637681,7.971014,4.347826,0.0,0.0,41.304348,58.695652,0.0,0.0
1,"Alameda County, CA",1994,0.0,0.0,0.0,0.0,78.698225,21.301775,0.0,0.0,...,36.094675,31.360947,10.650888,3.550296,0.591716,0.0,46.745562,53.254438,0.0,0.0
2,"Alameda County, CA",1995,68.27957,31.72043,0.0,0.0,81.182796,18.817204,0.0,0.0,...,36.55914,32.258065,6.451613,2.688172,0.0,0.0,47.849462,51.075269,1.075269,0.0
3,"Alameda County, CA",1996,63.190184,36.809816,0.0,0.0,84.04908,15.95092,0.0,29.487179,...,34.355828,29.447853,4.907975,3.067485,0.0,0.0,47.852761,52.147239,0.0,0.0
4,"Alameda County, CA",1997,73.053892,26.946108,0.0,0.0,0.0,0.0,0.0,21.794872,...,43.113772,22.155689,7.784431,1.197605,0.0,0.0,46.706587,53.293413,0.0,0.0


In [9]:
# ===============================
# 5. Apply Readable Labels (Codebook)
# ===============================

codebook_map = {
    "general_health_status": {
        1: "Excellent", 2: "Very good", 3: "Good", 4: "Fair", 5: "Poor",
        7: "Don’t know/Not sure", 9: "Refused"
    },
    "ever_told_high_bp": {
        1: "Yes", 2: "No", 7: "Don’t know/Not sure", 9: "Refused"
    },
    "ever_told_diabetes": {
        1: "Yes", 2: "No", 3: "Yes, during pregnancy", 4: "Pre-diabetes/Borderline",
        7: "Don’t know/Not sure", 9: "Refused"
    },
    "smoked_100_cigarettes": {
        1: "Yes", 2: "No", 7: "Don’t know/Not sure", 9: "Refused"
    },
    "current_smoking_frequency": {
        1: "Every day", 2: "Some days", 3: "Not at all",
        7: "Don’t know/Not sure", 9: "Refused"
    },
    "any_alcohol_past_month": {
        1: "Yes", 2: "No", 7: "Don’t know/Not sure", 9: "Refused"
    },
    "eats_fruit": {
        1: "Yes", 2: "No", 7: "Don’t know/Not sure", 9: "Refused"
    },
    "eats_other_vegetables": {
        1: "Yes", 2: "No", 7: "Don’t know/Not sure", 9: "Refused"
    },
    "any_physical_activity": {
        1: "Yes", 2: "No", 7: "Don’t know/Not sure", 9: "Refused"
    }
}

def decode_food_code(val):
    if val in [1, 2, 7, 9]:
        return {1: "Yes", 2: "No", 7: "Don’t know/Not sure", 9: "Refused"}[val]
    elif 101 <= val <= 199:
        return f"{val - 100} times/day"
    elif 201 <= val <= 299:
        return f"{val - 200} times/week"
    elif 301 <= val <= 399:
        return f"{val - 300} times/month"
    else:
        return f"Code {val}"

rename_map = {}
for col in cdc_wide.columns:
    if col in ["county_name", "survey_year"]:
        continue
    var, cat = col.rsplit("_", 1)
    try:
        cat_int = int(cat)
    except:
        cat_int = cat
    
    if var in codebook_map and cat_int in codebook_map[var]:
        rename_map[col] = f"{var} - {codebook_map[var][cat_int]}"
    elif var in ["eats_fruit", "eats_other_vegetables"]:
        rename_map[col] = f"{var} - {decode_food_code(cat_int)}"
    else:
        rename_map[col] = col

cdc_wide = cdc_wide.rename(columns=rename_map)

print("CDC wide with labels shape:", cdc_wide.shape)
display(cdc_wide.head())

CDC wide with labels shape: (492, 225)


Unnamed: 0,county_name,survey_year,any_alcohol_past_month - Yes,any_alcohol_past_month - No,any_alcohol_past_month - Don’t know/Not sure,any_alcohol_past_month - Refused,any_physical_activity - Yes,any_physical_activity - No,any_physical_activity - Don’t know/Not sure,current_smoking_frequency - Every day,...,general_health_status - Very good,general_health_status - Good,general_health_status - Fair,general_health_status - Poor,general_health_status - Don’t know/Not sure,general_health_status - Refused,smoked_100_cigarettes - Yes,smoked_100_cigarettes - No,smoked_100_cigarettes - Don’t know/Not sure,smoked_100_cigarettes - Refused
0,"Alameda County, CA",1993,62.318841,36.956522,0.724638,0.0,0.0,0.0,0.0,0.0,...,38.405797,24.637681,7.971014,4.347826,0.0,0.0,41.304348,58.695652,0.0,0.0
1,"Alameda County, CA",1994,0.0,0.0,0.0,0.0,78.698225,21.301775,0.0,0.0,...,36.094675,31.360947,10.650888,3.550296,0.591716,0.0,46.745562,53.254438,0.0,0.0
2,"Alameda County, CA",1995,68.27957,31.72043,0.0,0.0,81.182796,18.817204,0.0,0.0,...,36.55914,32.258065,6.451613,2.688172,0.0,0.0,47.849462,51.075269,1.075269,0.0
3,"Alameda County, CA",1996,63.190184,36.809816,0.0,0.0,84.04908,15.95092,0.0,29.487179,...,34.355828,29.447853,4.907975,3.067485,0.0,0.0,47.852761,52.147239,0.0,0.0
4,"Alameda County, CA",1997,73.053892,26.946108,0.0,0.0,0.0,0.0,0.0,21.794872,...,43.113772,22.155689,7.784431,1.197605,0.0,0.0,46.706587,53.293413,0.0,0.0


In [11]:
# ===============================
# 6. Merge CDC Health and BLS Labor Data
# ===============================

merged = pd.merge(
    cdc_wide,
    bls_summary,
    on=["county_name", "survey_year"],
    how="inner"   # inner join: only keep counties/years that appear in both
)

print("Final merged shape:", merged.shape)
display(merged.head())

Final merged shape: (474, 237)


Unnamed: 0,county_name,survey_year,any_alcohol_past_month - Yes,any_alcohol_past_month - No,any_alcohol_past_month - Don’t know/Not sure,any_alcohol_past_month - Refused,any_physical_activity - Yes,any_physical_activity - No,any_physical_activity - Don’t know/Not sure,current_smoking_frequency - Every day,...,employment_max,unemployment_mean,unemployment_min,unemployment_max,labor_force_mean,labor_force_min,labor_force_max,unemployment_rate_mean,unemployment_rate_min,unemployment_rate_max
0,"Alameda County, CA",1993,62.318841,36.956522,0.724638,0.0,0.0,0.0,0.0,0.0,...,639235.0,677319.923077,669864.0,681111.0,45298.846154,39404.0,49620.0,6.676923,5.8,7.3
1,"Alameda County, CA",1994,0.0,0.0,0.0,0.0,78.698225,21.301775,0.0,0.0,...,643742.0,678967.769231,675462.0,684694.0,42059.0,32608.0,46901.0,6.192308,4.8,6.9
2,"Alameda County, CA",1995,68.27957,31.72043,0.0,0.0,81.182796,18.817204,0.0,0.0,...,642535.0,676034.846154,668883.0,684352.0,38880.307692,32278.0,44668.0,5.776923,4.8,6.5
3,"Alameda County, CA",1996,63.190184,36.809816,0.0,0.0,84.04908,15.95092,0.0,29.487179,...,659950.0,677688.461538,665654.0,689632.0,34591.769231,28610.0,39006.0,5.107692,4.2,5.7
4,"Alameda County, CA",1997,73.053892,26.946108,0.0,0.0,0.0,0.0,0.0,21.794872,...,678307.0,693262.461538,683088.0,703102.0,31132.692308,24795.0,34815.0,4.5,3.5,5.0


In [13]:
# Save merged dataset to CSV
merged.to_csv("california_health_bls_merged.csv", index=False)

print("✅ Exported merged dataset to california_health_bls_merged.csv")

✅ Exported merged dataset to california_health_bls_merged.csv
