In [1]:
import pandas as pd

gemini_features = [
    "AGEP_A",          # age (years, 18‑85 top‑coded)
    "SEX_A",           # sex
    "HISPALLP_A",      # combined race / ethnicity
    "EDUCP_A",         # education level
    "BMICAT_A",        # BMI category
    "SMKCIGST_A",      # smoking status
    "HYPEV_A",         # ever hypertension
    "CHLEV_A",         # ever high cholesterol
    "DIBEV_A",         # ever diabetes
    "PHSTAT_A",        # self‑rated health
    "DEPEV_A",         # ever depression
    "COPDEV_A",        # ever COPD / chronic bronchitis / emphysema
    "STREV_A"          # ever stroke
]

target_columns = ['CHDEV_A', 'ANGEV_A', 'MIEV_A']

binary_variables = ["HYPEV_A","CHLEV_A","DIBEV_A","DEPEV_A","COPDEV_A","STREV_A",
    *target_columns]

data_df = pd.read_csv('adult23.csv')
data_df = data_df[gemini_features + target_columns]

data_df

Unnamed: 0,AGEP_A,SEX_A,HISPALLP_A,EDUCP_A,BMICAT_A,SMKCIGST_A,HYPEV_A,CHLEV_A,DIBEV_A,PHSTAT_A,DEPEV_A,COPDEV_A,STREV_A,CHDEV_A,ANGEV_A,MIEV_A
0,67,1,3,1,3,4,1,1,2,5,1,2,1,2,2,2
1,73,1,2,8,3,1,1,2,1,3,2,2,2,1,2,1
2,48,1,3,5,4,4,2,2,2,1,2,2,2,2,2,2
3,42,2,2,9,3,3,2,2,2,1,2,2,2,2,2,2
4,50,2,2,7,2,4,2,2,2,2,2,2,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29517,77,2,2,5,4,3,1,1,2,3,1,1,2,2,2,2
29518,59,2,2,7,3,4,2,1,2,1,1,2,2,2,2,2
29519,66,1,2,8,4,4,2,2,2,3,2,2,2,2,2,2
29520,53,2,2,7,3,1,1,2,2,2,2,2,2,2,2,2


In [2]:
common_miss = [7, 8, 9]
special_miss = {
    'AGEP_A':     [97, 98, 99],      # age
    'HISPALLP_A': [97, 98, 99],      # race / ethnicity
    'EDUCP_A':    [97, 98, 99],      # education
    'BMICAT_A':   [9],               # BMI unknown
    'SMKCIGST_A': [5, 9],            # smoking unknowns
}

def scrub(series: pd.Series) -> pd.Series:
    return series.replace(special_miss.get(series.name, []) + common_miss,
                          pd.NA)


data_df = data_df.apply(scrub)

In [3]:
LABELS = {
    "SEX_A": {
        1: "Male", 2: "Female", 7:"Refused", 8:"Not ascertained", 9:"Don't know"
    },
    "HISPALLP_A": {
        1:"Hispanic",
        2:"Non‑Hispanic White only",
        3:"Non‑Hispanic Black/African American only",
        4:"Non‑Hispanic Asian only",
        5:"Non‑Hispanic AIAN only",
        6:"Non‑Hispanic AIAN & another group",
        7:"Other / multiple races",
        97:"Refused", 98:"Not ascertained", 99:"Don't know"
    },
    "EDUCP_A": {
        0:"Never/kindergarten",
        1:"Grades 1‑11",
        2:"12th grade, no diploma",
        3:"GED",
        4:"High‑school graduate",
        5:"Some college, no degree",
        6:"Associate, occupational/technical",
        7:"Associate, academic",
        8:"Bachelor’s degree",
        9:"Master’s degree",
        10:"Professional/Doctorate",
        97:"Refused", 98:"Not ascertained", 99:"Don't know"
    },
    "BMICAT_A": {
        1:"Under‑weight (<18.5)",
        2:"Normal (18.5‑24.9)",
        3:"Over‑weight (25.0‑29.9)",
        4:"Obese (>=30)",
        9:"Unknown"
    },
    "SMKCIGST_A": {
        1:"Current – every day",
        2:"Current – some days",
        3:"Former",
        4:"Never",
        5:"Unknown", 9:"Not ascertained"
    },
    "PHSTAT_A": {
        1:"Excellent", 2:"Very good", 3:"Good",
        4:"Fair",      5:"Poor",
        7:"Refused", 8:"Not ascertained", 9:"Don't know"
    }
}

for col, mapping in LABELS.items():
    if col in data_df.columns:
        data_df[col] = data_df[col].map(mapping).astype("category")

In [4]:
yes_no = {1:1, 2:0}
data_df[binary_variables] = data_df[binary_variables].apply(lambda s: s.map(yes_no))

data_df['male'] = data_df['SEX_A'].map({1: 1, 2: 0})

data_df['heart_disease'] = (
    (data_df['CHDEV_A'] == 1) |
    (data_df['ANGEV_A'] == 1) |
    (data_df['MIEV_A']  == 1)
).astype(int)

X = data_df.drop(columns=['heart_disease'] + target_columns)
y = data_df["heart_disease"]


In [5]:
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import numpy as np

cat_cols = [c for c in X.columns if str(X[c].dtype) == "category"]
num_cols = [c for c in X.columns if c not in cat_cols]

X[cat_cols] = X[cat_cols].astype("string").fillna("__MISSING__")

X[num_cols] = X[num_cols].apply(pd.to_numeric, errors="coerce").astype(float)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)

train_pool = Pool(X_train, y_train, cat_features=cat_cols)
test_pool  = Pool(X_test,  y_test,  cat_features=cat_cols)

model = CatBoostClassifier(
    loss_function="Logloss",
    depth=6,
    learning_rate=0.05,
    iterations=4000,
    eval_metric="F1",
    random_seed=42,
    verbose=200
)

# 5) fit and evaluate
model.fit(train_pool, eval_set=test_pool, use_best_model=True)


0:	learn: 0.0020683	test: 0.0000000	best: 0.0000000 (0)	total: 65.1ms	remaining: 4m 20s
200:	learn: 0.2128408	test: 0.1660777	best: 0.1693122 (199)	total: 1.84s	remaining: 34.8s
400:	learn: 0.2857143	test: 0.1796200	best: 0.1927711 (268)	total: 3.7s	remaining: 33.2s
600:	learn: 0.3508048	test: 0.1689655	best: 0.1927711 (268)	total: 5.58s	remaining: 31.6s
800:	learn: 0.4052970	test: 0.1663837	best: 0.1927711 (268)	total: 7.49s	remaining: 29.9s
1000:	learn: 0.4467919	test: 0.1686341	best: 0.1927711 (268)	total: 9.35s	remaining: 28s
1200:	learn: 0.4750192	test: 0.1669449	best: 0.1927711 (268)	total: 11.2s	remaining: 26.2s
1400:	learn: 0.5092279	test: 0.1647059	best: 0.1927711 (268)	total: 13.1s	remaining: 24.3s
1600:	learn: 0.5310524	test: 0.1633333	best: 0.1927711 (268)	total: 15s	remaining: 22.5s
1800:	learn: 0.5541284	test: 0.1680395	best: 0.1927711 (268)	total: 16.9s	remaining: 20.6s
2000:	learn: 0.5750452	test: 0.1642036	best: 0.1927711 (268)	total: 18.8s	remaining: 18.8s
2200:	learn

<catboost.core.CatBoostClassifier at 0x127cf4790>

In [6]:
from sklearn.metrics import f1_score

proba = model.predict_proba(test_pool)[:, 1]
y_pred = (proba >= 0.50).astype(int)
f1 = f1_score(y_test, y_pred)
print("F‑1 score:", round(f1, 3))

F‑1 score: 0.193
