# Predicting Heart Disease

## Score: 0.95349

In [13]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

# Directory the submission file is written to; "." means the current working directory.
OUTPUT_DIR = Path(".")
# Directory (relative to the working directory) that train.csv and test.csv are read from.
DATA_DIR = Path("playground-series-s6e2")

In [14]:
# Load both splits from DATA_DIR; only the folder is configurable, the file names are fixed.
train_path = DATA_DIR / "train.csv"
test_path = DATA_DIR / "test.csv"
train, test = pd.read_csv(train_path), pd.read_csv(test_path)

# Print (rows, columns) for each split as a quick sanity check on the load.
print(f"Train: {train.shape}")
print(f"Test:  {test.shape}")

# Bare last expression so the notebook renders the first rows as a table.
train.head()

Train: (630000, 15)
Test:  (270000, 14)


Unnamed: 0,id,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,0,58,1,4,152,239,0,0,158,1,3.6,2,2,7,Presence
1,1,52,1,1,125,325,0,2,171,0,0.0,1,0,3,Absence
2,2,56,0,2,160,188,0,2,151,0,0.0,1,0,3,Absence
3,3,44,0,3,134,229,0,2,150,0,1.0,2,0,3,Absence
4,4,58,1,4,140,234,0,2,125,1,3.8,2,3,3,Presence


In [15]:
id_col, target_col = "id", "Heart Disease"

# Every column except the row identifier and the label is used as a model input.
feature_cols = [name for name in train.columns if name != id_col and name != target_col]

# Map the string labels to integer class codes.
# NOTE(review): later cells take column 1 of predict_proba as the submitted
# probability; confirm via `le.classes_` that code 1 is the intended class.
le = LabelEncoder()
le.fit(train[target_col])
y = le.transform(train[target_col])

# Work on copies so the raw frames stay untouched by the imputation below.
X_train = train.loc[:, feature_cols].copy()
X_test = test.loc[:, feature_cols].copy()

# Median imputation. The fill value always comes from the training split and is
# applied to both splits; columns with no missing values are skipped.
for col in feature_cols:
    has_gap = X_train[col].isna().any() or X_test[col].isna().any()
    if not has_gap:
        continue
    med = X_train[col].median()
    X_train[col], X_test[col] = X_train[col].fillna(med), X_test[col].fillna(med)

print(f"Features: {feature_cols}")
print(f"Target distribution: {pd.Series(y).value_counts().to_dict()}")

Features: ['Age', 'Sex', 'Chest pain type', 'BP', 'Cholesterol', 'FBS over 120', 'EKG results', 'Max HR', 'Exercise angina', 'ST depression', 'Slope of ST', 'Number of vessels fluro', 'Thallium']
Target distribution: {0: 347546, 1: 282454}


In [16]:
# Hyperparameters of the classifier being validated.
xgb_params = {
    "n_estimators": 500,
    "max_depth": 5,
    "min_child_weight": 3,
    "learning_rate": 0.05,
    "subsample": 0.7,
    "colsample_bytree": 0.7,
    "random_state": 42,
    "eval_metric": "auc",
}
model = xgb.XGBClassifier(**xgb_params)

# Shuffled stratified folds with a fixed seed so the split is repeatable.
n_splits = 5
cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Cross-validated class probabilities for the training rows; keep column 1 only.
oof_matrix = cross_val_predict(model, X_train, y, cv=cv, method="predict_proba")
oof_proba = oof_matrix[:, 1]

# One AUC computed over all cross-validated predictions at once.
cv_auc = roc_auc_score(y, oof_proba)
print(f"CV ROC AUC: {cv_auc:.5f}")

CV ROC AUC: 0.95532


In [17]:
# Seed-averaged ensemble for the test predictions.
#
# Hyperparameters are read from `model` (the estimator scored in the
# cross-validation cell) instead of being typed out a second time, so the
# submitted models cannot drift from the configuration whose CV AUC is
# reported. Only `random_state` differs between ensemble members.
seeds = [42, 43, 44, 45, 46]
base_params = model.get_params()

test_proba_list = []
for seed in seeds:
    m = xgb.XGBClassifier(**{**base_params, "random_state": seed})
    # Each member is fit on the full training data, with no hold-out.
    m.fit(X_train, y)
    # Keep column 1 of predict_proba, matching what the CV cell scored.
    test_proba_list.append(m.predict_proba(X_test)[:, 1])

# Element-wise mean across the ensemble members, giving one value per test row.
test_proba = np.mean(test_proba_list, axis=0)

assert test_proba.shape == (len(X_test),), "expected one averaged probability per test row"

In [18]:
# Build the submission frame from the `id_col` / `target_col` names defined in
# the preprocessing cell rather than repeating the string literals, so the
# header cannot drift from the names used when the data was prepared.
sub = pd.DataFrame({id_col: test[id_col], target_col: test_proba})

# Cheap invariants: fail here rather than write a malformed file.
# `between` is False for NaN, so this also catches missing predictions.
assert len(sub) == len(test), "submission must contain one row per test sample"
assert sub[target_col].between(0, 1).all(), "predictions must be probabilities in [0, 1]"

submission_path = OUTPUT_DIR / "submission.csv"
# No-op for the default "." but keeps the cell working if OUTPUT_DIR is changed.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
sub.to_csv(submission_path, index=False)
print(f"Submission saved to {submission_path}")

# Bare last expression so the notebook renders a preview of the written rows.
sub.head(10)

Submission saved to submission.csv


Unnamed: 0,id,Heart Disease
0,630000,0.958176
1,630001,0.00922
2,630002,0.986098
3,630003,0.00594
4,630004,0.180464
5,630005,0.981826
6,630006,0.005233
7,630007,0.624753
8,630008,0.991508
9,630009,0.013859
