In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import f1_score, classification_report


In [2]:
gemini_features = [
    "AGEP_A",          # age (years, 18‑85 top‑coded)
    "SEX_A",           # sex
    "HISPALLP_A",      # combined race / ethnicity
    "EDUCP_A",         # education level
    "BMICAT_A",        # BMI category
    "SMKCIGST_A",      # smoking status
    "HYPEV_A",         # ever hypertension
    "CHLEV_A",         # ever high cholesterol
    "DIBEV_A",         # ever diabetes
    "PHSTAT_A",        # self‑rated health
    "DEPEV_A",         # ever depression
    "COPDEV_A",        # ever COPD / chronic bronchitis / emphysema
    "STREV_A"          # ever stroke
]

target_columns = ['CHDEV_A', 'ANGEV_A', 'MIEV_A']

data_df = pd.read_csv('adult23.csv')
data_df = data_df[gemini_features + target_columns]

data_df

Unnamed: 0,AGEP_A,SEX_A,HISPALLP_A,EDUCP_A,BMICAT_A,SMKCIGST_A,HYPEV_A,CHLEV_A,DIBEV_A,PHSTAT_A,DEPEV_A,COPDEV_A,STREV_A,CHDEV_A,ANGEV_A,MIEV_A
0,67,1,3,1,3,4,1,1,2,5,1,2,1,2,2,2
1,73,1,2,8,3,1,1,2,1,3,2,2,2,1,2,1
2,48,1,3,5,4,4,2,2,2,1,2,2,2,2,2,2
3,42,2,2,9,3,3,2,2,2,1,2,2,2,2,2,2
4,50,2,2,7,2,4,2,2,2,2,2,2,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29517,77,2,2,5,4,3,1,1,2,3,1,1,2,2,2,2
29518,59,2,2,7,3,4,2,1,2,1,1,2,2,2,2,2
29519,66,1,2,8,4,4,2,2,2,3,2,2,2,2,2,2
29520,53,2,2,7,3,1,1,2,2,2,2,2,2,2,2,2


In [3]:
def map_codes(col, mapping, unknown="Unknown"):
    """Map numeric codes to text; keep <NA>; anything else → 'Unknown'."""
    return (
        col.replace(mapping)          # map known codes
           .mask(col.isna(), pd.NA)   # preserve genuine <NA>
           .fillna(unknown)           # leftover unusual codes
           .astype("string")
    )

# 2‑A : direct recodes
data_df["SEX_A"]       = map_codes(data_df["SEX_A"],       {1: "Male",  2: "Female"})
data_df["HISPALLP_A"]  = map_codes(data_df["HISPALLP_A"],  {
    1: "Hispanic", 2: "White‑NH", 3: "Black‑NH", 4: "Asian‑NH",
    5: "AIAN‑NH",  6: "AIAN+Other", 7: "Other"})
data_df["EDUCP_A"]     = map_codes(data_df["EDUCP_A"], {
    0: "None/KG", 1: "1–11th", 2: "12th/no dip", 3: "GED",
    4: "HS Grad", 5: "Some College", 6: "Assoc‑Occ/Voc",
    7: "Assoc‑Acad", 8: "Bachelor", 9: "Master", 10: "Prof/PhD"})
data_df["BMICAT_A"]    = map_codes(data_df["BMICAT_A"], {
    1: "Under", 2: "Normal", 3: "Over", 4: "Obese", 5: "ExtObese"})
data_df["SMKCIGST_A"]  = map_codes(data_df["SMKCIGST_A"], {
    1: "Every day", 2: "Some days", 3: "Former", 4: "Never"})
data_df["PHSTAT_A"]    = map_codes(data_df["PHSTAT_A"], {
    1: "Excellent", 2: "Very good", 3: "Good", 4: "Fair", 5: "Poor"})

# 2‑B : three‑level Yes/No/Unknown recodes
binary3 = ["HYPEV_A", "CHLEV_A", "DIBEV_A",
           "DEPEV_A", "COPDEV_A", "STREV_A"]

for col in binary3:
    data_df[col] = map_codes(data_df[col], {1: "Yes", 2: "No"})


In [4]:
data_df["heart_disease"] = (
    (data_df["CHDEV_A"] == 1) | (data_df["ANGEV_A"] == 1) | (data_df["MIEV_A"] == 1)
).astype(int)

X = data_df.drop(columns=["heart_disease"] + target_columns)
y = data_df["heart_disease"]

# Categorical columns are EVERY column in X (even age will be left numeric)
cat_cols = X.columns.tolist()           # all treated as categorical except age
cat_cols.remove("AGEP_A")               # keep age numeric

In [5]:
ohe = OneHotEncoder(
        sparse_output=False,
        handle_unknown="ignore"
      )

ct = ColumnTransformer(
        [("onehot", ohe, cat_cols)],
        remainder="passthrough"
     )

X_encoded = ct.fit_transform(X)

feature_names = (
    ct.named_transformers_["onehot"]
      .get_feature_names_out(cat_cols)
      .tolist() + ["AGEP_A"]
)
X_final = pd.DataFrame(X_encoded, columns=feature_names, index=X.index)

# sanity check – no NA left
assert X_final.isna().sum().sum() == 0, "still missing values!"

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X_final, y, test_size=0.20, stratify=y, random_state=42
)

models = {
    "Logistic Regression":
        Pipeline([
            ("scale", StandardScaler(with_mean=False)),
            ("clf",   LogisticRegression(max_iter=1000,
                                         class_weight="balanced",
                                         n_jobs=-1))
        ]),
    "Random Forest":
        RandomForestClassifier(
            n_estimators=1000,
            class_weight="balanced",
            random_state=42,
            n_jobs=-1
        ),
    "Gradient Boosting":
        GradientBoostingClassifier(
            n_estimators=1000,
            learning_rate=0.05,
            max_depth=3,
            random_state=42
        )
}

print("Model performance (F‑1 on held‑out 20 % split)\n" + "-"*48)
for name, clf in models.items():
    clf.fit(X_train, y_train)
    f1 = f1_score(y_test, clf.predict(X_test))
    print(f"{name:20s}  F‑1 = {f1:.3f}")

Model performance (F‑1 on held‑out 20 % split)
------------------------------------------------
Logistic Regression   F‑1 = 0.344
Random Forest         F‑1 = 0.176
Gradient Boosting     F‑1 = 0.202
