# Predicting Heart Disease

## Score: 0.95328

In [7]:
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

# Filesystem locations, both relative to the notebook's working directory.
OUTPUT_DIR = Path(".")  # submission.csv is written here
DATA_DIR = Path("playground-series-s6e2")  # train.csv and test.csv are read from here

In [8]:
# Read both competition splits from DATA_DIR in one pass.
train, test = (pd.read_csv(DATA_DIR / csv_name) for csv_name in ("train.csv", "test.csv"))

# Report (rows, columns) per split, then preview the first training rows.
print(f"Train: {train.shape}", f"Test:  {test.shape}", sep="\n")
train.head()

Train: (630000, 15)
Test:  (270000, 14)


Unnamed: 0,id,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,0,58,1,4,152,239,0,0,158,1,3.6,2,2,7,Presence
1,1,52,1,1,125,325,0,2,171,0,0.0,1,0,3,Absence
2,2,56,0,2,160,188,0,2,151,0,0.0,1,0,3,Absence
3,3,44,0,3,134,229,0,2,150,0,1.0,2,0,3,Absence
4,4,58,1,4,140,234,0,2,125,1,3.8,2,3,3,Presence


In [9]:
target_col = "Heart Disease"
id_col = "id"
# Every column except the row identifier and the label is a model input.
feature_cols = [c for c in train.columns if c not in (id_col, target_col)]

# LabelEncoder numbers classes in sorted order, so with the "Absence"/"Presence"
# labels visible in the train.head() preview, Absence -> 0 and Presence -> 1.
le = LabelEncoder()
y = le.fit_transform(train[target_col])

X_train = train[feature_cols].copy()
X_test = test[feature_cols].copy()

# Fill gaps with the TRAIN median and reuse that same value for test, so both
# splits are imputed identically. Columns without missing values are untouched.
for col in feature_cols:
    if X_train[col].isna().any() or X_test[col].isna().any():
        med = X_train[col].median()
        X_train[col] = X_train[col].fillna(med)
        X_test[col] = X_test[col].fillna(med)


def add_engineered_features(frame: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``frame`` with the engineered interaction columns appended.

    Train and test are passed through this single function so the two splits
    can never drift apart in how a feature is defined.

    Columns appended, in this order:
        max_hr_pct       -- "Max HR" divided by (220 - "Age")
        bp_chol          -- "BP" * "Cholesterol", scaled by 1e-4
        st_dep_angina    -- "ST depression" * "Exercise angina"
        st_dep_slope     -- "ST depression" * "Slope of ST"
        vessels_thallium -- "Number of vessels fluro" * "Thallium"
        metabolic        -- "FBS over 120" * ("Cholesterol" / 100)
        age_bp           -- "Age" * "BP", scaled by 1e-2
        sex_chestpain    -- "Sex" * "Chest pain type"

    Parameters
    ----------
    frame : pd.DataFrame
        Must contain the raw columns named above. It is not modified.

    Returns
    -------
    pd.DataFrame
        A new frame holding the original columns followed by the eight new ones.
    """
    # Floor the denominator at 1 so the ratio below never divides by zero or
    # by a negative number.
    predicted_max_hr = (220 - frame["Age"]).clip(lower=1)
    return frame.assign(
        max_hr_pct=frame["Max HR"] / predicted_max_hr,
        bp_chol=(frame["BP"] * frame["Cholesterol"]) / 1e4,
        st_dep_angina=frame["ST depression"] * frame["Exercise angina"],
        st_dep_slope=frame["ST depression"] * frame["Slope of ST"],
        vessels_thallium=frame["Number of vessels fluro"] * frame["Thallium"],
        metabolic=frame["FBS over 120"] * (frame["Cholesterol"] / 100),
        age_bp=(frame["Age"] * frame["BP"]) / 100,
        sex_chestpain=frame["Sex"] * frame["Chest pain type"],
    )


X_train = add_engineered_features(X_train)
X_test = add_engineered_features(X_test)

print(f"Features: {len(X_train.columns)} columns")
print(f"Target distribution: {pd.Series(y).value_counts().to_dict()}")

Features: 21 columns
Target distribution: {0: 347546, 1: 282454}


In [10]:
# Stratified, shuffled K-fold; the fixed seed makes the fold assignment repeatable.
n_splits = 5
cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Hyperparameters collected in one mapping so they can be read at a glance.
xgb_params = {
    "n_estimators": 500,
    "max_depth": 6,
    "learning_rate": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
    "eval_metric": "auc",
}
model = xgb.XGBClassifier(**xgb_params)

# Out-of-fold probabilities for the class encoded as 1 (second predict_proba column),
# scored against the encoded labels with ROC AUC.
oof_proba = cross_val_predict(
    estimator=model,
    X=X_train,
    y=y,
    cv=cv,
    method="predict_proba",
)[:, 1]
cv_auc = roc_auc_score(y_true=y, y_score=oof_proba)
print(f"CV ROC AUC: {cv_auc:.5f}")

CV ROC AUC: 0.95515


In [11]:
# Refit on the full training set, then keep the probability of the class encoded
# as 1 (second predict_proba column) for every test row.
test_proba = model.fit(X_train, y).predict_proba(X_test)[:, 1]

In [12]:
# Build the output path once so the file written and the message printed cannot disagree.
submission_path = OUTPUT_DIR / "submission.csv"
# No-op for the current directory; prevents to_csv from failing if OUTPUT_DIR
# is pointed at a folder that does not exist yet.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Reuse id_col / target_col (defined with the feature list earlier in the notebook)
# instead of repeating the column names as string literals.
sub = pd.DataFrame({id_col: test[id_col], target_col: test_proba})
sub.to_csv(submission_path, index=False)
print(f"Submission saved to {submission_path}")
sub.head(10)

Submission saved to submission.csv


Unnamed: 0,id,Heart Disease
0,630000,0.947601
1,630001,0.011489
2,630002,0.987816
3,630003,0.006934
4,630004,0.178209
5,630005,0.982828
6,630006,0.006913
7,630007,0.609313
8,630008,0.991441
9,630009,0.014698
