In [None]:
# Cell 1: Import Libraries
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

import joblib

In [None]:
# Cell 2: Load Dataset
# Read the clinical-records CSV (path is relative to the notebook's working
# directory), report its dimensions, and preview the first rows.
csv_path = "dataset/archive/heart_failure_clinical_records_dataset.csv"
data = pd.read_csv(csv_path)
n_rows_cols = data.shape
print("Shape of dataset:", n_rows_cols)
data.head()

In [None]:
# Cell 3: Data Info and Null Values
# Column dtypes / non-null counts first, then one grand total of missing cells
# (per-column null counts summed across all columns).
data.info()
nulls_per_column = data.isnull().sum()
total_nulls = nulls_per_column.sum()
print("Total null values in dataset:", total_nulls)

In [None]:
# Cell 4: Exploratory Data Analysis (EDA)
# Class balance of the target: DEATH_EVENT == 0 is reported as "Alive",
# DEATH_EVENT == 1 as "Died".
count_alive = len(data.loc[data["DEATH_EVENT"] == 0])
count_died = len(data.loc[data["DEATH_EVENT"] == 1])
print(f"Alive: {count_alive}, Died: {count_died}")

# Pie chart of the two outcome classes (the "Alive" wedge is pulled out).
fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(
    [count_alive, count_died],
    labels=["Alive", "Died"],
    explode=[0.1, 0],
    shadow=True,
    autopct="%1.1f%%",
)
ax.set_title("Death Event Distribution")
plt.show()

# Histogram of patient age with a kernel-density overlay.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(data["age"], bins=30, kde=True, ax=ax)
ax.set_title("Age Distribution")
plt.show()

# Pairwise correlation of every column, annotated to two decimals.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(data.corr(), annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Correlation Matrix")
plt.show()

In [None]:
# Cell 5: Split Dataset
# Target is DEATH_EVENT; every other column is used as a feature.
y = data['DEATH_EVENT']
X = data.drop(columns='DEATH_EVENT')

# 70/30 split, stratified on the target so both parts keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=0,
)
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

In [None]:
# Cell 6: Feature Engineering - Interaction Terms (Optional)
def add_interactions(X):
    """Return a copy of ``X`` with all pairwise column products appended.

    For every pair of columns ``(a, b)`` where ``a`` comes before ``b`` in
    ``X.columns``, a column named ``"{a}_x_{b}"`` holding ``X[a] * X[b]`` is
    added to the copy. The input frame itself is left untouched.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table whose columns can be multiplied element-wise.

    Returns
    -------
    pandas.DataFrame
        The original columns followed by the interaction columns.
    """
    column_names = list(X.columns)
    augmented = X.copy()
    for position, left in enumerate(column_names):
        # Pair each column only with the ones after it, so every unordered
        # pair appears once and no column is multiplied by itself.
        for right in column_names[position + 1:]:
            augmented[f"{left}_x_{right}"] = X[left] * X[right]
    return augmented

# Uncomment below to enable interaction features
# X_train_mod = add_interactions(X_train)
# X_test_mod = add_interactions(X_test)

In [None]:
# Cell 7: Define Evaluation Function
def evaluate_model(y_true, y_pred):
    """Print accuracy, precision, recall and the confusion matrix.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, in the same order as ``y_true``.

    Returns
    -------
    None
        The metrics are only printed, not returned.
    """
    scalar_metrics = (
        ("Accuracy Score:", accuracy_score),
        ("Precision Score:", precision_score),
        ("Recall Score:", recall_score),
    )
    # Each metric is computed and printed in turn, in the listed order.
    for label, metric_fn in scalar_metrics:
        print(label, metric_fn(y_true, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))

In [None]:
# Cell 8: Logistic Regression Baseline
# Plain logistic regression on the raw (unscaled) features; max_iter is raised
# to 1000 iterations.
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

print("Logistic Regression Results:")
evaluate_model(y_test, y_pred_lr)

In [None]:
# Cell 9: Logistic Regression with StandardScaler
# Same classifier as the baseline, but features are standardised first. The
# scaler lives inside the pipeline, so it is fitted on the training data only.
lr_pipe = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=1000),
).fit(X_train, y_train)
y_pred_lr_pipe = lr_pipe.predict(X_test)

print("Logistic Regression with Scaler Results:")
evaluate_model(y_test, y_pred_lr_pipe)

In [None]:
# Cell 10: Support Vector Machine with Grid Search
# The RBF kernel is distance based, so the features are standardised before
# they reach the SVC; otherwise the gamma grid below is dominated by whichever
# feature has the largest raw scale. The scaler sits inside the pipeline, so
# GridSearchCV refits it on the training part of every CV fold.
# Grid keys carry the "svc__" prefix that make_pipeline assigns to the SVC step.
param_grid = {
    'svc__C': [0.1, 1, 10, 100],
    'svc__gamma': [1, 0.1, 0.01, 0.001],
    'svc__kernel': ['rbf']
}

svc_pipe = make_pipeline(StandardScaler(), SVC(probability=True))
grid_svc = GridSearchCV(svc_pipe, param_grid, refit=True, verbose=1, cv=5)
grid_svc.fit(X_train, y_train)

print("Best SVM parameters:", grid_svc.best_params_)

# refit=True means predict() uses the best pipeline retrained on all of X_train.
y_pred_svc = grid_svc.predict(X_test)
print("SVM Model Results:")
evaluate_model(y_test, y_pred_svc)

In [None]:
# Cell 11: Decision Tree with Randomized Search
# Search space for the tree. 'sqrt' replaces the former 'auto' option: for
# classifiers 'auto' was an alias of 'sqrt', it was deprecated in
# scikit-learn 1.1 and removed in 1.3, where it fails parameter validation.
param_dist = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 2, 4, 6, 8],
    'min_samples_split': [2, 4, 6, 8],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 0.75, 0.9]
}

# 20 random draws from the grid, each scored with 5-fold CV on the training set.
rand_search_dt = RandomizedSearchCV(DecisionTreeClassifier(random_state=42), param_dist, n_iter=20, cv=5, n_jobs=-1, random_state=42)
rand_search_dt.fit(X_train, y_train)

print("Best Decision Tree params:", rand_search_dt.best_params_)

y_pred_dt = rand_search_dt.predict(X_test)
print("Decision Tree Results:")
evaluate_model(y_test, y_pred_dt)

In [None]:
# Cell 12: Random Forest with Randomized Search
# Search space for the forest. 'sqrt' replaces the former 'auto' option: for
# RandomForestClassifier 'auto' was an alias of 'sqrt', it was deprecated in
# scikit-learn 1.1 and removed in 1.3, where it fails parameter validation.
param_dist_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 2, 4, 6, 8],
    'min_samples_leaf': [1, 2, 4, 6],
    'max_features': ['sqrt', 0.5, 0.7],
    'min_impurity_decrease': [0.0, 0.01, 0.05]
}

# 20 random draws from the grid, each scored with 5-fold CV on the training set.
rand_search_rf = RandomizedSearchCV(RandomForestClassifier(random_state=42), param_dist_rf, n_iter=20, cv=5, n_jobs=-1, random_state=42)
rand_search_rf.fit(X_train, y_train)

print("Best Random Forest params:", rand_search_rf.best_params_)

y_pred_rf = rand_search_rf.predict(X_test)
print("Random Forest Results:")
evaluate_model(y_test, y_pred_rf)

In [None]:
# Cell 13: XGBoost Classifier
from xgboost import XGBClassifier, plot_importance

# Early stopping needs a monitoring set. It is carved out of the *training*
# data: monitoring on X_test would let the test labels pick the number of
# boosting rounds and make the test metrics reported below optimistic.
X_train_xgb, X_val_xgb, y_train_xgb, y_val_xgb = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

xgb = XGBClassifier(
    n_estimators=400,
    max_depth=4,
    learning_rate=0.1,
    eval_metric='logloss',
    # Constructor argument (xgboost >= 1.6); the fit() keyword of the same
    # name was removed in xgboost 2.0.
    early_stopping_rounds=10,
    random_state=42
)

# Training stops once validation logloss has not improved for 10 rounds.
xgb.fit(X_train_xgb, y_train_xgb, eval_set=[(X_val_xgb, y_val_xgb)], verbose=False)
y_pred_xgb = xgb.predict(X_test)

print("XGBoost Results:")
evaluate_model(y_test, y_pred_xgb)

plot_importance(xgb)
plt.show()

In [None]:
# Cell 14: Gradient Boosting Classifier
# 200 boosting stages of depth-1 trees at learning rate 0.1.
gbdt = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=1,
    random_state=42,
).fit(X_train, y_train)
y_pred_gbdt = gbdt.predict(X_test)

print("Gradient Boosting Results:")
evaluate_model(y_test, y_pred_gbdt)

In [None]:
# Cell 15: Save Best Model Example (XGBoost)
# Serialise the fitted XGBoost classifier into the current working directory.
model_file = 'heart_failure_xgb_model.pkl'
joblib.dump(xgb, model_file)
print("Model saved as 'heart_failure_xgb_model.pkl'")

In [None]:
# Cell 16: Load & Predict using saved model
# Reload the persisted classifier and score the held-out features with it.
# NOTE(review): joblib files are pickle-based; only load files you trust.
model = joblib.load('heart_failure_xgb_model.pkl')
sample_preds = model.predict(X_test)
first_ten = sample_preds[:10]
print("Sample predictions:", first_ten)