# Predicting Heart Disease

## Score: .95317

In [37]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import lightgbm as lgb
import catboost as cb

DATA_DIR = Path("playground-series-s6e2")
OUTPUT_DIR = Path(".")

In [38]:
train = pd.read_csv(DATA_DIR / "train.csv")
test = pd.read_csv(DATA_DIR / "test.csv")
original = pd.read_csv(Path("original-data") / "Heart_Disease_Prediction.csv")
np.random.seed(42)
n_copies = 50
noise_scale = 1.5
aug_rows = []
for _, row in original.iterrows():
    for _ in range(n_copies):
        new_row = row.copy()
        for col in original.columns:
            if col == "Heart Disease":
                continue
            new_row[col] = row[col] + np.random.normal(0, noise_scale)
        aug_rows.append(new_row)
aug = pd.DataFrame(aug_rows)
feature_cols_aug = [c for c in aug.columns if c != "Heart Disease"]
for col in feature_cols_aug:
    aug[col] = np.clip(aug[col].round(), 0, 500)
aug["id"] = -1
train = pd.concat([train, aug], ignore_index=True)
print(f"Train (synthetic + {len(aug)} augmented from original): {train.shape}")
print(f"Test: {test.shape}")
train.head()

Train (synthetic + 13500 augmented from original): (643500, 15)
Test: (270000, 14)


Unnamed: 0,id,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,0,58.0,1.0,4.0,152.0,239.0,0.0,0.0,158.0,1.0,3.6,2.0,2.0,7.0,Presence
1,1,52.0,1.0,1.0,125.0,325.0,0.0,2.0,171.0,0.0,0.0,1.0,0.0,3.0,Absence
2,2,56.0,0.0,2.0,160.0,188.0,0.0,2.0,151.0,0.0,0.0,1.0,0.0,3.0,Absence
3,3,44.0,0.0,3.0,134.0,229.0,0.0,2.0,150.0,0.0,1.0,2.0,0.0,3.0,Absence
4,4,58.0,1.0,4.0,140.0,234.0,0.0,2.0,125.0,1.0,3.8,2.0,3.0,3.0,Presence


In [39]:
target_col = "Heart Disease"
id_col = "id"
feature_cols = [c for c in train.columns if c not in (id_col, target_col)]

le = LabelEncoder()
y = le.fit_transform(train[target_col])

X_train = train[feature_cols].copy()
X_test = test[feature_cols].copy()

for col in feature_cols:
    if X_train[col].isna().any() or X_test[col].isna().any():
        med = X_train[col].median()
        X_train[col] = X_train[col].fillna(med)
        X_test[col] = X_test[col].fillna(med)

print(f"Features: {feature_cols}")
print(f"Target distribution: {pd.Series(y).value_counts().to_dict()}")

Features: ['Age', 'Sex', 'Chest pain type', 'BP', 'Cholesterol', 'FBS over 120', 'EKG results', 'Max HR', 'Exercise angina', 'ST depression', 'Slope of ST', 'Number of vessels fluro', 'Thallium']
Target distribution: {0: 355046, 1: 288454}


In [40]:
n_splits = 5
cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
model = xgb.XGBClassifier(
    n_estimators=500,
    max_depth=5,
    min_child_weight=3,
    learning_rate=0.05,
    subsample=0.7,
    colsample_bytree=0.7,
    random_state=42,
    eval_metric="auc",
)

oof_proba = cross_val_predict(model, X_train, y, cv=cv, method="predict_proba")[:, 1]
cv_auc = roc_auc_score(y, oof_proba)
print(f"CV ROC AUC: {cv_auc:.5f}")

CV ROC AUC: 0.95458


In [41]:
def avg_proba(models_proba):
    return np.mean(models_proba, axis=0)

xgb_seeds = [42, 43, 44, 45, 46]
xgb_list = []
for seed in xgb_seeds:
    m = xgb.XGBClassifier(n_estimators=500, max_depth=5, min_child_weight=3, learning_rate=0.05, subsample=0.7, colsample_bytree=0.7, random_state=seed, eval_metric="auc")
    m.fit(X_train, y)
    xgb_list.append(m.predict_proba(X_test)[:, 1])
xgb_avg = avg_proba(xgb_list)

lgb_seeds = [42, 43, 44]
lgb_list = []
for seed in lgb_seeds:
    m = lgb.LGBMClassifier(n_estimators=500, max_depth=5, min_child_samples=20, learning_rate=0.05, subsample=0.7, colsample_bytree=0.7, random_state=seed, verbosity=-1)
    m.fit(X_train, y)
    lgb_list.append(m.predict_proba(X_test)[:, 1])
lgb_avg = avg_proba(lgb_list)

cb_seeds = [42, 43, 44]
cb_list = []
for seed in cb_seeds:
    m = cb.CatBoostClassifier(iterations=500, depth=5, min_data_in_leaf=20, learning_rate=0.05, subsample=0.7, colsample_bylevel=0.7, random_seed=seed, verbose=0)
    m.fit(X_train, y)
    cb_list.append(m.predict_proba(X_test)[:, 1])
cb_avg = avg_proba(cb_list)

test_proba = (xgb_avg + lgb_avg + cb_avg) / 3

[WinError 2] The system cannot find the file specified
  File "c:\Users\ol1v3_7dwns5u\AppData\Local\Programs\Python\Python313\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
        "wmic CPU Get NumberOfCores /Format:csv".split(),
        capture_output=True,
        text=True,
    )
  File "c:\Users\ol1v3_7dwns5u\AppData\Local\Programs\Python\Python313\Lib\subprocess.py", line 554, in run
    with Popen(*popenargs, **kwargs) as process:
         ~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ol1v3_7dwns5u\AppData\Local\Programs\Python\Python313\Lib\subprocess.py", line 1039, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                        pass_fds, cwd, env,
                        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
                        gid, gids, uid, umask,
                        ^^^^^^^^^^^^^^^^^^

In [42]:
sub = pd.DataFrame({"id": test["id"], "Heart Disease": test_proba})
sub.to_csv(OUTPUT_DIR / "submission.csv", index=False)
print(f"Submission saved to {OUTPUT_DIR / 'submission.csv'}")
sub.head(10)

Submission saved to submission.csv


Unnamed: 0,id,Heart Disease
0,630000,0.939145
1,630001,0.007492
2,630002,0.983806
3,630003,0.005087
4,630004,0.198219
5,630005,0.983159
6,630006,0.007849
7,630007,0.670108
8,630008,0.99183
9,630009,0.013779
