In [1]:
import pandas as pd

gemini_features = ['AGEP_A', 'SEX_A', 'HISPALLP_A', 'EDUCP_A',
    'BMICAT_A', 'SMKCIGST_A',
    'HYPEV_A', 'CHLEV_A', 'DIBEV_A', 'PHSTAT_A',
    'DEPEV_A', 'COPDEV_A', 'STREV_A']

target_columns = ['CHDEV_A', 'ANGEV_A', 'MIEV_A']

data_df = pd.read_csv('adult23.csv')
data_df = data_df[gemini_features + target_columns]

data_df

Unnamed: 0,AGEP_A,SEX_A,HISPALLP_A,EDUCP_A,BMICAT_A,SMKCIGST_A,HYPEV_A,CHLEV_A,DIBEV_A,PHSTAT_A,DEPEV_A,COPDEV_A,STREV_A,CHDEV_A,ANGEV_A,MIEV_A
0,67,1,3,1,3,4,1,1,2,5,1,2,1,2,2,2
1,73,1,2,8,3,1,1,2,1,3,2,2,2,1,2,1
2,48,1,3,5,4,4,2,2,2,1,2,2,2,2,2,2
3,42,2,2,9,3,3,2,2,2,1,2,2,2,2,2,2
4,50,2,2,7,2,4,2,2,2,2,2,2,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29517,77,2,2,5,4,3,1,1,2,3,1,1,2,2,2,2
29518,59,2,2,7,3,4,2,1,2,1,1,2,2,2,2,2
29519,66,1,2,8,4,4,2,2,2,3,2,2,2,2,2,2
29520,53,2,2,7,3,1,1,2,2,2,2,2,2,2,2,2


In [2]:
common_miss = [7, 8, 9]
special_miss = {
    'AGEP_A':     [97, 98, 99],      # age
    'HISPALLP_A': [97, 98, 99],      # race / ethnicity
    'EDUCP_A':    [97, 98, 99],      # education
    'BMICAT_A':   [9],               # BMI unknown
    'SMKCIGST_A': [5, 9],            # smoking unknowns
}

def to_na(s):
    return s.replace(special_miss.get(s.name, []) + common_miss, pd.NA)

data_df = data_df.apply(to_na)

In [3]:
binary = [
    'HYPEV_A', 'CHLEV_A', 'DIBEV_A',
    'DEPEV_A', 'COPDEV_A', 'STREV_A'
] + target_columns

data_df[binary] = data_df[binary].apply(lambda c: c.map({1: 1, 2: 0}))


In [4]:
data_df['male'] = data_df['SEX_A'].map({1: 1, 2: 0})

data_df['heart_disease'] = (
    (data_df['CHDEV_A'] == 1) |
    (data_df['ANGEV_A'] == 1) |
    (data_df['MIEV_A']  == 1)
).astype(int)

X = data_df.drop(columns=['heart_disease'] + target_columns)
y = data_df['heart_disease']

data_df

Unnamed: 0,AGEP_A,SEX_A,HISPALLP_A,EDUCP_A,BMICAT_A,SMKCIGST_A,HYPEV_A,CHLEV_A,DIBEV_A,PHSTAT_A,DEPEV_A,COPDEV_A,STREV_A,CHDEV_A,ANGEV_A,MIEV_A,male,heart_disease
0,67,1,3,1,3,4,1.0,1.0,0.0,5,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0
1,73,1,2,,3,1,1.0,0.0,1.0,3,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1
2,48,1,3,5,4,4,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
3,42,2,2,,3,3,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,50,2,2,,2,4,0.0,0.0,0.0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29517,77,2,2,5,4,3,1.0,1.0,0.0,3,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0
29518,59,2,2,,3,4,0.0,1.0,0.0,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0
29519,66,1,2,,4,4,0.0,0.0,0.0,3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
29520,53,2,2,,3,1,1.0,0.0,0.0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [5]:
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import numpy as np

cat_cols = [
    'SEX_A', 'HISPALLP_A', 'EDUCP_A', 'BMICAT_A',
    'SMKCIGST_A', 'PHSTAT_A'
]
num_cols = X.columns.difference(cat_cols)

X[cat_cols] = (X[cat_cols]
               .astype("string")
               .fillna("__MISSING__"))

X[num_cols] = (X[num_cols]
               .replace({pd.NA: np.nan})
               .astype(float))

X[cat_cols] = X[cat_cols].astype("string")

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)

train_pool = Pool(X_train, y_train, cat_features=cat_cols)
test_pool  = Pool(X_test,  y_test,  cat_features=cat_cols)

model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.03,
    depth=6,
    loss_function='Logloss',
    eval_metric='F1',
    nan_mode='Min',
    random_state=42,
    verbose=100
)

# 5) fit and evaluate
model.fit(train_pool, eval_set=test_pool, use_best_model=True)


  .replace({pd.NA: np.nan})


0:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 69.6ms	remaining: 1m 9s
100:	learn: 0.0948693	test: 0.0588235	best: 0.0627451 (94)	total: 922ms	remaining: 8.21s
200:	learn: 0.1728507	test: 0.1477477	best: 0.1513514 (193)	total: 1.8s	remaining: 7.17s
300:	learn: 0.2097345	test: 0.1604278	best: 0.1604278 (300)	total: 2.65s	remaining: 6.15s
400:	learn: 0.2363716	test: 0.1786340	best: 0.1786340 (400)	total: 3.54s	remaining: 5.29s
500:	learn: 0.2591477	test: 0.1830743	best: 0.1924399 (468)	total: 4.44s	remaining: 4.43s
600:	learn: 0.2760684	test: 0.1865285	best: 0.1924399 (468)	total: 5.37s	remaining: 3.56s
700:	learn: 0.2961392	test: 0.1799308	best: 0.1924399 (468)	total: 6.27s	remaining: 2.67s
800:	learn: 0.3200336	test: 0.1761658	best: 0.1924399 (468)	total: 7.21s	remaining: 1.79s
900:	learn: 0.3369384	test: 0.1663778	best: 0.1924399 (468)	total: 8.14s	remaining: 894ms
999:	learn: 0.3504132	test: 0.1591696	best: 0.1924399 (468)	total: 9.07s	remaining: 0us

bestTest = 0.192

<catboost.core.CatBoostClassifier at 0x16b946b10>

In [8]:
from sklearn.metrics import f1_score

proba = model.predict_proba(test_pool)[:, 1]
y_pred = (proba >= 0.50).astype(int)
f1 = f1_score(y_test, y_pred)
print("F‑1 score:", round(f1, 3))

F‑1 score: 0.192
