# 05 - Hyperparameter Tuning

This notebook performs hyperparameter optimization for multiple machine learning models using the selected subset of features.

The goals of this notebook are:

- compare baseline (non-tuned) models  
- search for optimal hyperparameters  
- evaluate model generalization on the test set  
- select the final model to be saved and deployed  

In [2]:
import sys
sys.path.append("..")

import pandas as pd
import numpy as np

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.pipeline import Pipeline

import joblib

from src.config import RANDOM_STATE, SELECTED_FEATURES

## 1. Dataset Loading

We load the preprocessed training and test sets as well as the final selected feature subset.


In [3]:
# Feature matrices: preprocessed train / test splits written by the earlier notebooks.
X_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/processed/X_train_preprocessed.csv",
        "../data/processed/X_test_preprocessed.csv",
    )
)

# Targets: each file is read as a DataFrame and squeezed
# (presumably a single label column -> Series; verify against the export step).
y_train, y_test = (
    pd.read_csv(csv_path).squeeze()
    for csv_path in (
        "../data/processed/y_train.csv",
        "../data/processed/y_test.csv",
    )
)

# Restrict both splits to the feature subset declared in src.config.
X_train_sel, X_test_sel = X_train[SELECTED_FEATURES], X_test[SELECTED_FEATURES]

X_train_sel.head()

Unnamed: 0,concave_points_mean,concavity_worst,symmetry_worst,radius_avg,perimeter_avg,area_avg,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,symmetry_mean,fractal_dimension_mean
0,-0.398175,-0.610227,0.054566,0.32968,0.227148,0.271332,0.518559,0.891826,0.424632,0.383925,-0.974744,-0.689772,-0.688586,-1.039155,-0.825056
1,-0.303075,-0.712666,-0.137576,-0.567734,-0.579571,-0.541143,-0.516364,-1.63971,-0.541349,-0.542961,0.476219,-0.631834,-0.604281,0.521543,-0.454523
2,-0.765459,-0.431313,-0.675893,-0.154254,-0.344409,-0.274852,-0.368118,0.455515,-0.38825,-0.40297,-1.432979,-0.383927,-0.342175,-0.850857,-0.226171
3,1.213233,2.958619,-0.075646,-0.113585,0.391845,-0.127485,0.205285,0.726168,0.40033,0.070612,0.243253,2.203585,2.256094,0.818474,0.899791
4,0.713767,0.327775,-0.909322,1.259901,1.143881,1.056292,1.243005,0.194195,1.210377,1.206652,-0.111442,0.051348,0.732962,-0.427187,-0.822184


## 2. Baseline Model Performance

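Before tuning any hyperparameters, we evaluate each model with scikit-learn's default settings (apart from a raised `max_iter` for logistic regression and `probability=True` for the SVC, so that it exposes `predict_proba`).  
These test-set metrics provide a reference (baseline) for judging the improvements obtained through optimization.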


In [6]:
# Untuned reference estimators, keyed by the display name used in the result tables.
# Every estimator receives the shared seed so repeated runs give the same numbers.
baseline_models = {
    "Logistic Regression": LogisticRegression(
        random_state=RANDOM_STATE,
        max_iter=500,
    ),
    "Random Forest": RandomForestClassifier(
        random_state=RANDOM_STATE,
    ),
    "Gradient Boosting": GradientBoostingClassifier(
        random_state=RANDOM_STATE,
    ),
    # probability=True makes SVC expose predict_proba.
    "SVC (RBF Kernel)": SVC(
        random_state=RANDOM_STATE,
        probability=True,
    ),
}

def evaluate(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model : estimator
        Classifier exposing ``fit`` and ``predict`` plus either
        ``predict_proba`` or ``decision_function``. It is fitted in place,
        so the object passed in is a trained model after the call.
    X_train, y_train
        Training features and labels passed to ``model.fit``.
    X_test, y_test
        Held-out features and labels used for every metric.

    Returns
    -------
    list of float
        ``[accuracy, precision, recall, f1, roc_auc]``, in that order.

    Notes
    -----
    The metrics are called with their scikit-learn defaults and the score
    column at index 1 is used for ROC AUC, so a binary target is assumed.
    """
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    # ROC AUC needs a continuous score. Prefer the probability column at
    # index 1; fall back to the raw decision values for estimators without
    # predict_proba (roc_auc_score accepts non-thresholded scores).
    if hasattr(model, "predict_proba"):
        scores = model.predict_proba(X_test)[:, 1]
    else:
        scores = model.decision_function(X_test)

    return [
        accuracy_score(y_test, pred),
        precision_score(y_test, pred),
        recall_score(y_test, pred),
        f1_score(y_test, pred),
        roc_auc_score(y_test, scores),
    ]

# One row per baseline model, one column per test-set metric returned by `evaluate`.
results_baseline = pd.DataFrame.from_dict(
    {
        label: evaluate(estimator, X_train_sel, X_test_sel, y_train, y_test)
        for label, estimator in baseline_models.items()
    },
    orient="index",
    columns=["Accuracy", "Precision", "Recall", "F1-score", "ROC AUC"],
)

results_baseline

Unnamed: 0,Accuracy,Precision,Recall,F1-score,ROC AUC
Logistic Regression,0.964912,0.952381,0.952381,0.952381,0.992725
Random Forest,0.938596,0.972973,0.857143,0.911392,0.992725
Gradient Boosting,0.964912,1.0,0.904762,0.95,0.99504
SVC (RBF Kernel),0.95614,0.974359,0.904762,0.938272,0.985119


## 3. Hyperparameter Search Spaces

We define search grids for each algorithm.  
The search spaces are intentionally moderate in size to balance performance and computational cost.


In [7]:
# Candidate hyperparameter values per model.
# Outer keys are the model display names; inner keys are estimator parameter names.
param_spaces = {
    "Logistic Regression": dict(
        C=[0.001, 0.01, 0.1, 1, 10, 100],
        penalty=["l2"],
        solver=["lbfgs"],
    ),
    "Random Forest": dict(
        n_estimators=[50, 100, 200, 300],
        max_depth=[None, 5, 8, 12],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 4],
    ),
    "Gradient Boosting": dict(
        n_estimators=[50, 100, 200],
        learning_rate=[0.01, 0.05, 0.1, 0.2],
        max_depth=[2, 3, 4],
        subsample=[0.8, 1.0],
    ),
    "SVC (RBF Kernel)": dict(
        C=[0.1, 1, 10, 100],
        gamma=["scale", "auto"],
        kernel=["rbf"],
    ),
}

## 4. RandomizedSearchCV Optimization

We run `RandomizedSearchCV` with 5-fold cross-validation, sampling up to 20 candidate configurations per model and ranking them by F1-score, to explore the hyperparameter space efficiently.

The best estimator for each model is stored for later evaluation.


In [8]:
# Upper bound on the number of configurations sampled per model.
MAX_SEARCH_ITER = 20

# Best estimator found for each model, keyed by display name.
tuned_models = {}

for name, model in baseline_models.items():
    space = param_spaces[name]

    # When every entry is a finite list of values, the search space has
    # prod(len(values)) distinct configurations. Asking for more iterations
    # than that makes scikit-learn warn and truncate the search, so cap
    # n_iter up front. Entries exposing `.rvs` are continuous distributions
    # with no finite size; in that case the full budget is used.
    if all(not hasattr(values, "rvs") for values in space.values()):
        grid_size = int(np.prod([len(values) for values in space.values()]))
        n_iter = min(MAX_SEARCH_ITER, grid_size)
    else:
        n_iter = MAX_SEARCH_ITER

    search = RandomizedSearchCV(
        estimator=model,
        param_distributions=space,
        n_iter=n_iter,
        cv=5,
        scoring="f1",
        n_jobs=-1,
        random_state=RANDOM_STATE,
    )

    search.fit(X_train_sel, y_train)
    tuned_models[name] = search.best_estimator_




In [None]:
# One row per tuned model, scored with the same `evaluate` helper and the same
# train / test splits as the baseline table so the numbers are directly comparable.
results_tuned = pd.DataFrame.from_dict(
    {
        label: evaluate(estimator, X_train_sel, X_test_sel, y_train, y_test)
        for label, estimator in tuned_models.items()
    },
    orient="index",
    columns=["Accuracy", "Precision", "Recall", "F1-score", "ROC AUC"],
)

results_tuned

In [None]:
# Tuned minus baseline, aligned on model name and metric.
# Both tables were produced by `evaluate` on the same test split, so a positive
# entry means tuning improved that metric and a negative entry means it got worse.
results_delta = results_tuned - results_baseline

results_delta