In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import pickle as pkl

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from scipy.stats import uniform

# Load data

In [5]:
# Read the dataset; the "num" column is the target, every other column is a feature.
df = pd.read_csv("../data/heart_disease.csv")
y = df["num"]
X = df.drop(columns="num")

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=30,
)

# GridSearchCV

In [20]:
# Logistic Regression: exhaustive search over regularisation strength and solver.
param_grid_lr = dict(
    C=[0.01, 0.1, 1, 10],
    solver=['lbfgs', 'newton-cg', 'sag', 'saga'],
)

# No `cv` argument is passed, so GridSearchCV falls back to its library default.
grid_lr = GridSearchCV(
    estimator=LogisticRegression(random_state=30),
    param_grid=param_grid_lr,
    scoring='accuracy',
).fit(X_train, y_train)

print("Best Logistic Regression:", grid_lr.best_params_)
print("Cross-Validated Best Score (Logistic Regression):", grid_lr.best_score_)


Best Logistic Regression: {'C': 10, 'solver': 'lbfgs'}
Cross-Validated Best Score (Logistic Regression): 0.59515306122449


In [None]:
# Decision Tree: exhaustive search over tree depth and the minimum number of
# samples required to split a node.
param_grid_dt = {
    'max_depth': [2, 3, 5, 7, 10],
    'min_samples_split': [2, 5, 10]
}

# No `cv` argument is passed, so GridSearchCV falls back to its library default.
grid_dt = GridSearchCV(DecisionTreeClassifier(random_state=30), param_grid_dt, scoring='accuracy')
grid_dt.fit(X_train, y_train)

print("Best Decision Tree:", grid_dt.best_params_)
# The label previously said "Logistic Regression" (copy-paste slip); this is the
# Decision Tree's mean cross-validated accuracy.
print("Cross-Validated Best Score (Decision Tree):", grid_dt.best_score_)


Cross-Validated Best Score (Logistic Regression): 0.533078231292517
Best Decision Tree: {'max_depth': 2, 'min_samples_split': 2}


In [None]:
# Random Forest: exhaustive search over forest size, tree depth and split threshold.
param_grid_rf = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 7, None],  # None places no depth limit on the trees
    min_samples_split=[2, 5, 10],
)

# No `cv` argument is passed, so GridSearchCV falls back to its library default.
grid_rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=30),
    param_grid=param_grid_rf,
    scoring='accuracy',
).fit(X_train, y_train)

print("Best Random Forest:", grid_rf.best_params_)
print("Cross-Validated Best Score (Random Forest):", grid_rf.best_score_)

Cross-Validated Best Score (Random Forest): 0.6075680272108843
Best Random Forest: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}


In [None]:
# SVM: exhaustive search over the penalty C, the kernel and the gamma setting.
param_grid_svm = dict(
    C=[0.1, 1, 10],
    kernel=['linear', 'rbf'],
    gamma=['scale', 'auto'],
)

# probability=True is kept on the estimator; accuracy scoring itself only needs
# hard predictions. No `cv` argument is passed, so the library default applies.
grid_svm = GridSearchCV(
    estimator=SVC(probability=True, random_state=30),
    param_grid=param_grid_svm,
    scoring='accuracy',
).fit(X_train, y_train)

print("Best SVM:", grid_svm.best_params_)
print("Cross-Validated Best Score (SVM):", grid_svm.best_score_)

Cross-Validated Best Score (SVM): 0.599234693877551
Best SVM: {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}


# RandomizedSearchCV

In [27]:
# Logistic Regression: randomized search, 10 sampled configurations.
param_dist_lr = dict(
    # scipy's uniform(loc, scale) draws from [loc, loc + scale], i.e. [0.001, 10.001].
    C=uniform(loc=0.001, scale=10),
    solver=['lbfgs', 'newton-cg', 'sag', 'saga'],
)

# random_state on the search fixes which configurations get sampled.
rand_lr = RandomizedSearchCV(
    estimator=LogisticRegression(random_state=30),
    param_distributions=param_dist_lr,
    n_iter=10,
    scoring='accuracy',
    random_state=30,
).fit(X_train, y_train)

print("Best Logistic Regression (Randomized):", rand_lr.best_params_)
print("Best Score (Logistic Regression):", rand_lr.best_score_)

Best Logistic Regression (Randomized): {'C': np.float64(1.6375072610275332), 'solver': 'newton-cg'}
Best Score (Logistic Regression): 0.5952380952380952


In [28]:
# Decision Tree: randomized search, 10 configurations sampled from the lists below.
param_dist_dt = dict(
    max_depth=[2, 3, 5, 7, 10, None],  # None places no depth limit on the tree
    min_samples_split=[2, 5, 10, 15, 20],
)

# random_state on the search fixes which configurations get sampled.
rand_dt = RandomizedSearchCV(
    estimator=DecisionTreeClassifier(random_state=30),
    param_distributions=param_dist_dt,
    n_iter=10,
    scoring='accuracy',
    random_state=30,
).fit(X_train, y_train)

print("Best Decision Tree (Randomized):", rand_dt.best_params_)
print("Best Score (Decision Tree):", rand_dt.best_score_)

Best Decision Tree (Randomized): {'min_samples_split': 20, 'max_depth': 10}
Best Score (Decision Tree): 0.5333333333333333


In [39]:
# Random Forest: randomized search, 20 configurations sampled from the lists below.
param_dist_rf = dict(
    n_estimators=[50, 100, 200, 300],
    max_depth=[3, 5, 7, None],  # None places no depth limit on the trees
    min_samples_split=[2, 5, 10, 15],
)

# random_state on the search fixes which configurations get sampled.
rand_rf = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=30),
    param_distributions=param_dist_rf,
    n_iter=20,
    scoring='accuracy',
    random_state=30,
).fit(X_train, y_train)

print("Best Random Forest (Randomized):", rand_rf.best_params_)
print("Best Score (Random Forest):", rand_rf.best_score_)

Best Random Forest (Randomized): {'n_estimators': 200, 'min_samples_split': 5, 'max_depth': None}
Best Score (Random Forest): 0.5995748299319728


In [38]:
# SVM: randomized search, 20 sampled configurations.
param_dist_svm = dict(
    # scipy's uniform(loc, scale) draws from [loc, loc + scale], i.e. [0.01, 50.01].
    C=uniform(loc=0.01, scale=50),
    kernel=['linear', 'rbf'],
    gamma=['scale', 'auto'],
)

# random_state on the search fixes which configurations get sampled.
rand_svm = RandomizedSearchCV(
    estimator=SVC(probability=True, random_state=30),
    param_distributions=param_dist_svm,
    n_iter=20,
    scoring='accuracy',
    random_state=30,
).fit(X_train, y_train)

print("Best SVM (Randomized):", rand_svm.best_params_)
print("Best Score (SVM):", rand_svm.best_score_)

Best SVM (Randomized): {'C': np.float64(1.341218238655718), 'gamma': 'auto', 'kernel': 'rbf'}
Best Score (SVM): 0.599234693877551


# Load baseline models

In [4]:
# Load the four pickled baseline models, in the order they are unpacked below.
# NOTE: pickle.load can execute arbitrary code — only load trusted model files.
baseline_paths = (
    "../models/logistic_regression_model.pkl",
    "../models/decision_tree_model.pkl",
    "../models/random_forest_model.pkl",
    "../models/svm_model.pkl",
)

baseline_models = []
for model_path in baseline_paths:
    with open(model_path, "rb") as model_file:
        baseline_models.append(pkl.load(model_file))

log_reg_model, dt_model, rf_model, svm_model = baseline_models

# Compare models

In [None]:
# Layout of `models`: model name -> [baseline, grid-search best, randomized-search best]
#   models[name][0]  baseline model loaded from disk
#   models[name][1]  best estimator found by GridSearchCV
#   models[name][2]  best estimator found by RandomizedSearchCV

model_names = ["Logistic Regression", "Decision Tree", "Random Forest", "SVM"]
baseline_fits = [log_reg_model, dt_model, rf_model, svm_model]
grid_fits = [grid_lr, grid_dt, grid_rf, grid_svm]
rand_fits = [rand_lr, rand_dt, rand_rf, rand_svm]

models = {
    name: [baseline, grid.best_estimator_, rand.best_estimator_]
    for name, baseline, grid, rand in zip(model_names, baseline_fits, grid_fits, rand_fits)
}
