# ❤️ Cardiovascular Disease Risk Prediction

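This notebook trains an XGBoost classifier to predict heart disease. It loads the raw dataset, splits it into training and test sets, standardizes the features, fits the model, reports the ROC AUC on the held-out test set, and uses SHAP to rank feature importance.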

In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score
import joblib
import shap

# Load the raw heart dataset; the path is relative to the notebook's working directory.
raw_csv_path = '../data/raw/heart_raw.csv'
df = pd.read_csv(raw_csv_path)

# Report the number of rows that were read.
print(f"Dataset Size: {len(df)}")

In [None]:
# Single seed shared by the split and the booster so a fresh
# "Restart Kernel -> Run All" reproduces the same partition and the same model.
RANDOM_STATE = 42

# Separate the predictors from the label column ('target').
X = df.drop('target', axis=1)
y = df['target']

# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in the training and test partitions, so the reported
# AUC is not skewed by an unlucky split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

# Fit the scaler on the training partition only, then reuse those statistics
# for the test partition so no test-set information leaks into preprocessing.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Gradient-boosted trees: 300 rounds, small learning rate, depth-7 trees.
model = XGBClassifier(
    n_estimators=300,
    learning_rate=0.03,
    max_depth=7,
    random_state=RANDOM_STATE,
)
model.fit(X_train_scaled, y_train)

# Column 1 of predict_proba is the probability of the class at index 1
# (the positive class, assuming a 0/1 target -- TODO confirm label encoding).
probs = model.predict_proba(X_test_scaled)[:, 1]
print(f"AUC Score: {roc_auc_score(y_test, probs):.4f}")

## Feature Importance
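We use SHAP's `TreeExplainer` to compute each feature's contribution to the model's predictions on the test set, then plot the mean absolute SHAP value per feature as a bar chart to rank overall feature importance.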

In [None]:
# Build a tree-model explainer around the fitted classifier.
explainer = shap.TreeExplainer(model)

# SHAP values are computed on the scaled test matrix, i.e. the same
# representation the model was trained and evaluated on.
shap_values = explainer.shap_values(X_test_scaled)

# The unscaled X_test DataFrame is passed as `features` so the plot can label
# the bars with the original column names.
shap.summary_plot(
    shap_values,
    features=X_test,
    plot_type='bar',
)