## load data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.base import clone
from sklearn.metrics import make_scorer, accuracy_score, recall_score, roc_auc_score, f1_score,\
precision_score, roc_auc_score, classification_report, fbeta_score
from sklearn.pipeline import Pipeline
from scipy.stats import randint

import warnings
warnings.filterwarnings("ignore")

In [2]:
data_path = '../data/heart_disease.csv'

df = pd.read_csv(data_path)

NUMERIC_COLS = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
CATEGORICAL_COLS = [f for f in df.columns if f not in NUMERIC_COLS]

# SELECTED_FEATS = ['oldpeak', 'thalach', 'ca', 'cp_4', 'exang', 'sex']

# NUMERIC_COLS = list(set(NUMERIC_COLS) & set(SELECTED_FEATS))
# CATEGORICAL_COLS = list(set(CATEGORICAL_COLS) & set(SELECTED_FEATS))

df.head(2)

Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,slope,ca,target,cp_2,cp_3,cp_4,restecg_1,restecg_2,thal_6,thal_7
0,63.0,1.0,145.0,233.0,1.0,150.0,0.0,2.3,3.0,0.0,0,0,0,0,0,1,1,0
1,67.0,1.0,160.0,286.0,0.0,108.0,1.0,1.5,2.0,3.0,1,0,0,1,0,1,0,0


In [3]:
# split the data
X = df.drop("target", axis=1)
y = df["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Models Pipelines And Functions

- Best three models that showed **high accuracy**:
    - Random forest
    - Support vector machine
    - Logistic regression

In [4]:
CATEGORICAL_COLS.remove('target')

In [5]:
# preprocessor that scales only numeric features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), NUMERIC_COLS),
        ('cat', 'passthrough', CATEGORICAL_COLS)
    ]
)
MODELS = ["Random Forest", "SVM", "Logistic Regression"] # referred to as 0, 1, 2 (don't change order)
METRICS = {x:None for x in MODELS} # metrics for each model
SCORING = make_scorer(fbeta_score, beta=2) # f2-scorer

- The **F2-score** optimally balances the clinical reality that false negatives (missed heart disease) are **more costly** than false positives in most healthcare scenarios.

In [6]:
# Baseline function
def baseline_model(pipeline, model):
    """calcualate the baseline model scores on train data using cross validation and test data
    and save it in metrics """
    # used Baseline model clone to not affect the original pipeline
    baseline = clone(pipeline)

    # Calculate the Cross-Validation Score on the training data
    cv_scores = cross_val_score(
        baseline,
        X_train,
        y_train,
        scoring=SCORING,
        cv=10,
        n_jobs=-1
    )
    avg_cv_score = np.mean(cv_scores)
    
    # Fit the model
    baseline.fit(X_train, y_train)
    baseline_score = fbeta_score(y_test, baseline.predict(X_test), beta=2)
    
    print(f"Baseline F2 score (CV): {avg_cv_score:.4f}")
    print(f"Baseline F2 score (Test): {baseline_score:.4f}")
    
    # save the scores
    METRICS[model] = {'Baseline_CV':avg_cv_score, 'Baseline_Test':baseline_score}

In [7]:
# Step 1: Random Grid Search Function
def rand_search(pipeline, params):
    """returns best parameters from phase 1"""
    print("=== PHASE 1: RANDOM_GRIDSEARCH ===")
    random_search = RandomizedSearchCV(
        pipeline,
        params,
        n_iter=50,
        cv=10, 
        scoring=SCORING,
        n_jobs=-1,
        random_state=42
    )
    random_search.fit(X_train, y_train)

    print("Best Random Search parameters (Phase 1):")
    for key, value in random_search.best_params_.items():
        print(f"  {key}: {value}")
    print(f"Best CV score:{random_search.best_score_:.3f}")

    return random_search.best_params_

In [8]:
# Step 2: Grid Search Function
def grid_search(pipeline, params):
    """returns grid search object from phase 2"""
    print("=== PHASE 2: GRIDSEARCH ===")
    grid_search = GridSearchCV(
        pipeline,
        params,
        cv=10,
        scoring=SCORING,
        n_jobs=-1,
        verbose=1,
        return_train_score=True
    )

    grid_search.fit(X_train, y_train)

    print("Best Grid Search parameters (Phase 2):")
    for key, value in grid_search.best_params_.items():
        print(f"  {key}: {value}")
    print(f"Best CV Score: {grid_search.best_score_:.3f}")
    
    return grid_search

In [9]:
# Evaluate the model on Test Data
def evaluate(grid_search):
    """returns a dictionary of error metrics"""
    best_est = grid_search.best_estimator_
    y_pred = best_est.predict(X_test)
    y_pred_proba = best_est.predict_proba(X_test)[:, 1]

    dic = {
        'Best_Params': grid_search.best_params_,
        'Best_CV': grid_search.best_score_,
        'Best_Test': fbeta_score(y_test, y_pred, beta=2),
        'Best_Test_Accuracy': accuracy_score(y_test, y_pred),
        'Best_Test_Precision': precision_score(y_test, y_pred),
        'Best_Test_Recall': recall_score(y_test, y_pred),
    }
    
    return dic

# Print the results
def print_eval(dic):
    """prints evaluation results on test data"""
    print("==== PERFORMANCE ====")
    for metric, value in dic.items():  # Skip first items
        if metric in ["Best_Params"]:
            continue
        print(f"{metric}: {value:.3f}")

# Random Forest Hyperparameter Tuning

#### 1. Random forest Basline

In [10]:
# Random Forest Pipeline
rf_pipeline = Pipeline([
#     ('preprocessor', preprocessor), # since binary trees doesn't need scaling
    ('cls', RandomForestClassifier(random_state=42))
])
# Baseline Random Forest score
baseline_model(rf_pipeline, MODELS[0])

Baseline F2 score (CV): 0.7636
Baseline F2 score (Test): 0.9441


#### 2. Random forest grid search (phase 1): 
- Using **RandomizedSearchCV** with variety of parameters

In [11]:
# Random Search for Random Forest Parameter Grid
rf_param_dist = {
    'cls__n_estimators': randint(50, 300), # minimum number of samples at a leaf node
    'cls__max_depth': randint(3, 15), # maximum depth of each tree.
    'cls__min_samples_split': randint(2, 20), # minimum number of samples to split a node.
    'cls__min_samples_leaf': randint(1, 10), # minimum number of samples at a leaf node.
    'cls__max_features': ['sqrt', 'log2', 0.5, 0.7], # number of features considered at each split.
    'cls__bootstrap': [True, False]
}

best = rand_search(rf_pipeline, rf_param_dist)

=== PHASE 1: RANDOM_GRIDSEARCH ===
Best Random Search parameters (Phase 1):
  cls__bootstrap: False
  cls__max_depth: 8
  cls__max_features: log2
  cls__min_samples_leaf: 5
  cls__min_samples_split: 2
  cls__n_estimators: 253
Best CV score:0.781


#### 3. Random forest grid search (phase 2):
- Using **GridSearchCV** with more specific parametrs

In [12]:
# Grid Search for Random Forest Parameter Grid
rf_param_grid = {
    'cls__n_estimators': [int(best['cls__n_estimators'] * 0.9), best['cls__n_estimators'], int(best['cls__n_estimators'] * 1.1)],
    'cls__max_depth': [best['cls__max_depth'] - 1, best['cls__max_depth'], best['cls__max_depth'] + 1],
    'cls__min_samples_split': [best['cls__min_samples_split'], best['cls__min_samples_split']+1, best['cls__min_samples_split']+2],
    'cls__min_samples_leaf': [max(1, best['cls__min_samples_leaf']-1), best['cls__min_samples_leaf'], best['cls__min_samples_leaf']+1],
    'cls__max_features': [best['cls__max_features']],
    'cls__bootstrap': [best['cls__bootstrap']]
}
rf_gs = grid_search(rf_pipeline, rf_param_grid)

=== PHASE 2: GRIDSEARCH ===
Fitting 10 folds for each of 81 candidates, totalling 810 fits
Best Grid Search parameters (Phase 2):
  cls__bootstrap: False
  cls__max_depth: 7
  cls__max_features: log2
  cls__min_samples_leaf: 5
  cls__min_samples_split: 2
  cls__n_estimators: 278
Best CV Score: 0.782


#### 4. Evaluation for random forest best model

In [13]:
# Evaluate best model
METRICS[MODELS[0]].update(**evaluate(rf_gs))
print_eval(METRICS[MODELS[0]])

==== PERFORMANCE ====
Baseline_CV: 0.764
Baseline_Test: 0.944
Best_CV: 0.782
Best_Test: 0.922
Best_Test_Accuracy: 0.918
Best_Test_Precision: 0.897
Best_Test_Recall: 0.929


# Support vector machine hypertuning

#### 1. Support vector machine Basline

In [14]:
# SVM Pipeline
svm_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('cls', SVC(probability=True, random_state=42))  # probability=True for AUC
])

# Baseline support vector score
baseline_model(svm_pipeline, MODELS[1])

Baseline F2 score (CV): 0.7461
Baseline F2 score (Test): 0.9220


#### 2. SVM grid search (phase 1):
- Using **RandomizedSearchCV** with variety of parameters

In [15]:
# Random Search for Support Vector Machine Parameter Grid

# linear and rbf kernel
svm_param_dist = {
    "cls__C": np.logspace(-3, 3, 20),        # 0.001 → 1000
    "cls__gamma": np.logspace(-4, 1, 20),    # 0.0001 → 10
    "cls__kernel": ["linear", "rbf"],
    "cls__class_weight": [None, "balanced"]
    
}

# poly kernel
# svm_param_dist = {
#     "cls__kernel": ["poly"],
#     "cls__degree": [2, 3],
#     "cls__C": [0.1, 1, 10],
#     "cls__gamma": ["scale", "auto", 0.01, 0.1],
#     "cls__coef0": [0, 1]
# }

best = rand_search(svm_pipeline, svm_param_dist)

=== PHASE 1: RANDOM_GRIDSEARCH ===
Best Random Search parameters (Phase 1):
  cls__kernel: linear
  cls__gamma: 10.0
  cls__class_weight: balanced
  cls__C: 2.976351441631316
Best CV score:0.798


#### 3. SVM grid search (phase 2):
- Using **GridSearchCV** with more specific parametrs

In [16]:
# Random Search for Support Vector Machine Parameter Grid

# for linear kernel
svm_param_grid = {
    "cls__C": np.logspace(np.log10(best['cls__C']) - 0.5, np.log10(best['cls__C']) + 0.5, 10),
    "cls__kernel": [best['cls__kernel']],
    "cls__class_weight": [None, "balanced"]
}

svm_gs = grid_search(svm_pipeline, svm_param_grid)

=== PHASE 2: GRIDSEARCH ===
Fitting 10 folds for each of 20 candidates, totalling 200 fits
Best Grid Search parameters (Phase 2):
  cls__C: 0.9412049672680661
  cls__class_weight: balanced
  cls__kernel: linear
Best CV Score: 0.813


#### 4. Evaluation for Support Vector Machine best model

In [17]:
# Evaluate best model
METRICS[MODELS[1]].update(**evaluate(svm_gs))
print_eval(METRICS[MODELS[1]])

==== PERFORMANCE ====
Baseline_CV: 0.746
Baseline_Test: 0.922
Best_CV: 0.813
Best_Test: 0.909
Best_Test_Accuracy: 0.885
Best_Test_Precision: 0.839
Best_Test_Recall: 0.929


## Logistic regression hypertuning

#### 1. Logistic regression Basline

In [18]:
# Logistic Regression Pipeline
lr_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('cls', LogisticRegression(random_state=42, max_iter=1000))
])

# Baseline Logistic Regression
baseline_model(lr_pipeline, MODELS[2])

Baseline F2 score (CV): 0.7865
Baseline F2 score (Test): 0.9091


#### 2. Logistic Regression grid search (phase 1):
- Using **RandomizedSearchCV** with variety of parameters

In [19]:
# Random Search for Logistic Regression Parameter Grid

lr_param_dist = {
    "cls__penalty": ["l1", "l2", "elasticnet"],
    "cls__C": np.logspace(-3, 3, 50),    # 0.001 → 1000
    "cls__solver": ["liblinear", "saga"],  # supports l1 & elasticnet
    "cls__class_weight": [None, "balanced", [{0: 1, 1: W} for W in [1.5, 3, 5, 10]]],
    "cls__l1_ratio": np.linspace(0, 1, 5)  # only matters if elasticnet
}

best = rand_search(lr_pipeline, lr_param_dist)

=== PHASE 1: RANDOM_GRIDSEARCH ===
Best Random Search parameters (Phase 1):
  cls__solver: liblinear
  cls__penalty: l2
  cls__l1_ratio: 0.75
  cls__class_weight: balanced
  cls__C: 0.0071968567300115215
Best CV score:0.825


#### 3. Logistic Regression grid search (phase 2):
- Using **GridSearchCV** with more specific parametrs

In [20]:
# Grid Search for Logistic Regression Parameter Grid
lr_param_grid = {
    "cls__C": np.logspace(-4, -1, 10),   # narrower range
    "cls__penalty": [best["cls__penalty"]],
    "cls__solver": [best["cls__solver"]],
    "cls__class_weight": [best["cls__class_weight"]]
}

lr_gs = grid_search(lr_pipeline, lr_param_grid)

=== PHASE 2: GRIDSEARCH ===
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Best Grid Search parameters (Phase 2):
  cls__C: 0.00046415888336127773
  cls__class_weight: balanced
  cls__penalty: l2
  cls__solver: liblinear
Best CV Score: 0.858


### 4. Evaluation for Logistic Regression best model

In [21]:
# Evaluate best model
METRICS[MODELS[2]].update(**evaluate(lr_gs))
print_eval(METRICS[MODELS[2]])

==== PERFORMANCE ====
Baseline_CV: 0.787
Baseline_Test: 0.909
Best_CV: 0.858
Best_Test: 0.886
Best_Test_Accuracy: 0.705
Best_Test_Precision: 0.609
Best_Test_Recall: 1.000


# Comapring Results

In [22]:
# Display results
print(f"\n{'='*80}")
print("FINAL RESULTS COMPARISON")
print(f"{'='*80}")

models = [
    "Logistic Regression",
    "Random Forest",
    "SVM",
]
all_grids = {
        'Random Forest': rf_gs,
        'SVM': svm_gs,
        'Logistic Regression': lr_gs
}
# Save all Results in a data frame
results = []
for name in MODELS:
    results.append({
        "Model": name,
        **METRICS[name],
    })
df_results = pd.DataFrame(results).sort_values(by='Best_Test', ascending=False)
cols_order = ['Best_Params', 'Model', 'Baseline_CV', 'Best_CV', 'Baseline_Test', 'Best_Test',
             'Best_Test_Accuracy', 'Best_Test_Precision', 'Best_Test_Recall']
df_results = df_results[cols_order]

# Print Results
print(df_results.drop(columns='Best_Params'))

# Display best parameters for each model
print(f"\n{'='*80}")
print("BEST PARAMETERS FOR EACH MODEL")
print(f"{'='*80}")

for i in results:
    print(f"\n{i['Model']}:")
    for param, value in i['Best_Params'].items():
        print(f"  {param}: {value}")



FINAL RESULTS COMPARISON
                 Model  Baseline_CV   Best_CV  Baseline_Test  Best_Test  \
0        Random Forest     0.763605  0.782450       0.944056   0.921986   
1                  SVM     0.746063  0.812989       0.921986   0.909091   
2  Logistic Regression     0.786503  0.858348       0.909091   0.886076   

   Best_Test_Accuracy  Best_Test_Precision  Best_Test_Recall  
0            0.918033             0.896552          0.928571  
1            0.885246             0.838710          0.928571  
2            0.704918             0.608696          1.000000  

BEST PARAMETERS FOR EACH MODEL

Random Forest:
  cls__bootstrap: False
  cls__max_depth: 7
  cls__max_features: log2
  cls__min_samples_leaf: 5
  cls__min_samples_split: 2
  cls__n_estimators: 278

SVM:
  cls__C: 0.9412049672680661
  cls__class_weight: balanced
  cls__kernel: linear

Logistic Regression:
  cls__C: 0.00046415888336127773
  cls__class_weight: balanced
  cls__penalty: l2
  cls__solver: liblinear


- While the **Logistic Regression** achieved a perfect $1.0$ Recall (it identified all positive cases), its Precision is very low ($0.608696$). This low Precision means it had many false positives, which significantly dragged down its overall $F_2$ score to $0.886076$.
- The **Random Forest** achieved a high Recall ($0.928571$) while maintaining a strong Precision ($0.896552$), leading to the superior final $F_2$ score
- So we can say that **Random Forest** is the most effective model.

In [23]:
import joblib
joblib.dump(rf_gs.best_estimator_, "../models/final_model.pkl")

['../models/final_model.pkl']