# **6. Hyperparameter tuning**

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

df_reduced = pd.read_csv("../data/heart_reduced.csv")
X_reduced = df_reduced.drop(columns='num')
y = df_reduced['num'].astype(int)

X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.2, stratify=y, random_state=42)


In [2]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
sco = {"accuracy": "accuracy", "f1": "f1", "recall": "recall", "precision": "precision"}


def summarize_best(search, name):
    print(f"{name} best params:", search.best_params_)
    print(f"{name} best CV AUC: {search.best_score_:.4f}")

**5.1 Logistic Regression (GridSearchCV)**

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.pipeline import Pipeline

log_pipe = Pipeline(steps=[
    ("fs", SelectKBest(mutual_info_classif, k="all")),
    ("clf", LogisticRegression(max_iter=1000, solver="liblinear", class_weight="balanced"))
])

log_grid = {
    "fs__k": [min(10, X_reduced.shape[1]), "all"],
    "clf__penalty": ["l1", "l2"],
    "clf__C": [0.1, 1, 5, 10]
}

log_search = GridSearchCV(log_pipe, param_grid=log_grid, scoring=sco, refit='f1', cv=cv, n_jobs=-1)
log_search.fit(X_train, y_train)
summarize_best(log_search, "LogReg")


LogReg best params: {'clf__C': 0.1, 'clf__penalty': 'l1', 'fs__k': 9}
LogReg best CV AUC: nan




**5.2 Decision tree (GridSearchCV)**

In [4]:
from sklearn.tree import DecisionTreeClassifier

dt_grid = {
    "max_depth": [None, 3, 5, 7, 10],
    "min_samples_split": [2, 5, 10, 20],
    "min_samples_leaf": [1, 2, 4, 8],
    "criterion": ["gini", "entropy"]
}

dt_search = GridSearchCV(DecisionTreeClassifier(class_weight="balanced", random_state=42),
                         param_grid=dt_grid, scoring=sco, refit="f1", cv=cv, n_jobs=-1)
dt_search.fit(X_train, y_train)
summarize_best(dt_search, "DecisionTree")

DecisionTree best params: {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
DecisionTree best CV AUC: nan


 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan]


**5.3 Random forest (RandomizedSearchCV)**

In [5]:
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint

rf_dist = {
    "n_estimators": randint(300, 800),
    "max_depth": [None] + list(range(4, 18, 4)),
    "min_samples_split": randint(2, 16),
    "min_samples_leaf": randint(1, 10),
    "max_features": ["sqrt", "log2", None]
}

rf_search = RandomizedSearchCV(
    RandomForestClassifier(class_weight="balanced_subsample", random_state=42),
    param_distributions=rf_dist,
    n_iter=50, scoring=sco, refit= "f1", cv=cv, n_jobs=-1, random_state=42
)
rf_search.fit(X_train, y_train)
summarize_best(rf_search, "RandomForest")


 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan]


RandomForest best params: {'max_depth': 12, 'max_features': 'sqrt', 'min_samples_leaf': 8, 'min_samples_split': 14, 'n_estimators': 320}
RandomForest best CV AUC: nan


**5.4 SVM**

In [6]:
from sklearn.svm import SVC
from sklearn.decomposition import PCA

svm_pipe = Pipeline(steps=[
    ("pca", PCA(n_components=0.95)),
    ("clf", SVC(probability=True, class_weight="balanced", random_state=42))
])

svm_grid = {
    "pca": [PCA(n_components=0.95), "passthrough"],
    "clf__kernel": ["rbf", "linear"],
    "clf__C": [0.5, 1, 5, 10],
    "clf__gamma": ["scale", "auto"]
}

svm_search = GridSearchCV(svm_pipe, param_grid=svm_grid, scoring=sco, refit="f1", cv=cv, n_jobs=-1)
svm_search.fit(X_train, y_train)
summarize_best(svm_search, "SVM")

SVM best params: {'clf__C': 0.5, 'clf__gamma': 'scale', 'clf__kernel': 'rbf', 'pca': PCA(n_components=0.95)}
SVM best CV AUC: nan


 nan nan nan nan nan nan nan nan nan nan nan nan nan nan]


**Compare optimized models on the test set**

In [7]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

best_models = {
    "LogReg": log_search.best_estimator_,
    "DecisionTree": dt_search.best_estimator_,
    "RandomForest": rf_search.best_estimator_,
    "SVM": svm_search.best_estimator_
}

tuned_results = []
plt.figure(figsize=(8,6))

for name, model in best_models.items():
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    rec = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    tuned_results.append([name, acc, prec, rec, f1]) 


tuned_df = pd.DataFrame(tuned_results, columns=["Model","Accuracy","Precision","Recall","F1"]).set_index("Model") # Updated columns
print(tuned_df)

              Accuracy  Precision    Recall        F1
Model                                                
LogReg        0.783333   0.827917  0.783333  0.804195
DecisionTree  0.833333   0.808642  0.833333  0.815629
RandomForest  0.733333   0.833398  0.733333  0.777250
SVM           0.700000   0.874231  0.700000  0.763171


<Figure size 800x600 with 0 Axes>

# Persist best model

In [8]:
import joblib

best_name = tuned_df["F1"].idxmax()
final_model = log_search.best_estimator_
print("Best tuned model:", best_name)

joblib.dump(final_model, "../models/final_pipeline.pkl")

print("Best model saved to models/final_pipeline.pkl")

Best tuned model: DecisionTree
Best model saved to models/final_pipeline.pkl


In [9]:
print(final_model)

Pipeline(steps=[('fs',
                 SelectKBest(k=9,
                             score_func=<function mutual_info_classif at 0x00000145B5381DA0>)),
                ('clf',
                 LogisticRegression(C=0.1, class_weight='balanced',
                                    max_iter=1000, penalty='l1',
                                    solver='liblinear'))])


In [11]:
import numpy as np
import joblib

model = joblib.load("../models/final_pipeline.pkl")

input_array = np.array([[1, 2.3, 1, 0, 150, 0, 2, 2, 3]])

prediction = model.predict(input_array)[0]
proba = model.predict_proba(input_array)[0] 
print("Prediction:", prediction)     
print("Probabilities:", proba)     


Prediction: 2
Probabilities: [0.17073364 0.41394175 0.41532461]




In [14]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix


y_pred = model.predict(X_test)

# Overall metrics
accuracy = accuracy_score(y_test, y_pred)
macro_f1 = f1_score(y_test, y_pred, average="macro")
weighted_f1 = f1_score(y_test, y_pred, average="weighted")

print("Accuracy:", round(accuracy, 2))
print("Macro F1-score:", round(macro_f1, 2))
print("Weighted F1-score:", round(weighted_f1, 2))

# Per-class metrics
report = classification_report(y_test, y_pred, target_names=[
    "Class 0 (No disease)",
    "Class 1 (Low risk)",
    "Class 2 (Moderate risk)"
])
print("\nClassification Report:\n", report)

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", cm)


Accuracy: 0.78
Macro F1-score: 0.42
Weighted F1-score: 0.8

Classification Report:
                          precision    recall  f1-score   support

   Class 0 (No disease)       0.94      0.90      0.92        50
     Class 1 (Low risk)       0.40      0.29      0.33         7
Class 2 (Moderate risk)       0.00      0.00      0.00         3

               accuracy                           0.78        60
              macro avg       0.45      0.40      0.42        60
           weighted avg       0.83      0.78      0.80        60


Confusion Matrix:
 [[45  2  3]
 [ 1  2  4]
 [ 2  1  0]]
