### Model analysis

In [2]:
# Import libraries
import os
import sys
from pathlib import Path
import joblib
import warnings
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

os.environ['DYLD_LIBRARY_PATH'] = '/opt/homebrew/opt/libomp/lib'
import lightgbm as lgb

# Set the root directory
PROJECT_ROOT = Path.cwd().parent  

# Add the 'src' directory to the Python module search path
sys.path.append(str(PROJECT_ROOT / "src"))
sys.path.append(os.path.abspath("../src"))

# Now that the src folder is in sys.path, you can import
from functions import load_data, nrCV, generate_median_CI_table, winner_model_selection


1. Load the cleaned dataset

In [None]:
# Load processed dataset
X, y = load_data("../data/breast_cancer_final.csv")

2. Define the models thar will be evaluated using nrCV

In [None]:
# Define the models to be evaluated
estimators = [
    ("LogisticRegression", LogisticRegression(max_iter=10000, solver='saga')),
    ("GaussianNB", GaussianNB()),
    ("LDA", LinearDiscriminantAnalysis()),
    ("SVM", SVC(probability=True)),
    ("RandomForest", RandomForestClassifier(random_state=42)),
    ("LightGBM", lgb.LGBMClassifier(verbose=-1))  
]

3. Define the hyperparameters of each models for fine tuning inside the nrCV inner loop using grid search

In [None]:
# Define the hyperparameters for each model
hyperparameters = {
    "LogisticRegression": {
        "penalty": ["elasticnet"],
        "l1_ratio": [0.1, 0.5, 0.9],
        "C": [0.01, 0.1, 1, 10],
    },
    "GaussianNB": {
        "var_smoothing": [1e-09, 1e-08, 1e-07]
    },
    "LDA": {
        "solver": ["lsqr", "eigen"], 
        "shrinkage": ["auto", None]
    },
    "SVM": {
        "C": [0.1, 1, 10],
        "gamma": ["scale", "auto"],
        "kernel": ["rbf", "poly"],
    },
    "RandomForest": {
        "n_estimators": [100, 200],
        "max_depth": [None, 10, 20],
        "min_samples_split": [2, 5],
    },
    "LightGBM": {
        "num_leaves": [31, 50],
        "learning_rate": [0.01, 0.1],
        "n_estimators": [100, 200],
    }
}


4. Perform repeated nested cross-validation using the `nrCV` class from functions.py
    
    * Filter out any warnings using the `warnings` module

    * Run the nrCV class under the variable `model_runner`
    
    * Train the models and print the summary statistics for generalization assessment
    
    * Perform bootstrap resampling on the models' summary statistics using the function `generate_median_CI_table` from functions.py
    
    * Save the bootstap resampling results in a CSV file

In [None]:
# Suppress warnings
warnings.filterwarnings('ignore')

# Run nested cross-validation with the nrCV class
# The 'rounds' parameter is the number of rounds, 'N' is the number of outer folds,
# and 'K' is the number of inner folds.
model_runner = nrCV(
    dataset=(X,y),
    estimators=estimators,
    hyperparameters=hyperparameters,
    rounds=10,  
    N=5,       
    K=3,       
    inner_metric="f1_macro", 
    scoring_strategy="macro"
)

model_runner.train()

model_runner.summary()

# Perform bootstrapping to generate confidence intervals for the median performance
df_median_ci = generate_median_CI_table(model_runner)
display(df_median_ci)

# Save the results to a CSV file in the results directory
df_median_ci.to_csv("../results/median_CI_results.csv", index=False)


Model: LogisticRegression
  MCC: Mean = 0.9393 | Std = 0.0290
  Balanced Accuracy: Mean = 0.9653 | Std = 0.0177
  F1: Mean = 0.9690 | Std = 0.0151
  F2: Mean = 0.9666 | Std = 0.0167
  Recall: Mean = 0.9653 | Std = 0.0177
  Precision: Mean = 0.9742 | Std = 0.0127
  Specificity: Mean = 0.9896 | Std = 0.0115
  NPV: Mean = 0.9658 | Std = 0.0208
  AUC: Mean = 0.9933 | Std = 0.0084
  PR-AUC: Mean = 0.9920 | Std = 0.0090
--------------------------------------------------
Model: GaussianNB
  MCC: Mean = 0.8537 | Std = 0.0487
  Balanced Accuracy: Mean = 0.9244 | Std = 0.0255
  F1: Mean = 0.9263 | Std = 0.0244
  F2: Mean = 0.9250 | Std = 0.0250
  Recall: Mean = 0.9244 | Std = 0.0255
  Precision: Mean = 0.9294 | Std = 0.0243
  Specificity: Mean = 0.9539 | Std = 0.0227
  NPV: Mean = 0.9384 | Std = 0.0255
  AUC: Mean = 0.9845 | Std = 0.0089
  PR-AUC: Mean = 0.9742 | Std = 0.0228
--------------------------------------------------
Model: LDA
  MCC: Mean = 0.8985 | Std = 0.0356
  Balanced Accuracy: Me

Unnamed: 0,Model,Metric,Median,95% CI Lower,95% CI Upper
0,LogisticRegression,MCC,0.94045,0.937991,0.955969
1,LogisticRegression,Balanced Accuracy,0.969144,0.965281,0.971457
2,LogisticRegression,F1,0.969951,0.968447,0.97749
3,LogisticRegression,F2,0.96986,0.966341,0.973955
4,LogisticRegression,Recall,0.969144,0.965281,0.971457
5,LogisticRegression,Precision,0.97651,0.971528,0.979031
6,LogisticRegression,Specificity,0.992958,0.984375,1.0
7,LogisticRegression,NPV,0.968502,0.966667,0.971429
8,LogisticRegression,AUC,0.995739,0.994536,0.997635
9,LogisticRegression,PR-AUC,0.994565,0.992063,0.997037


5. Best model selection

    * Select the best model out of the six, using the function `winner_model_selection`, that takes as input the CSV file with the bootstrap 
      resampling results

    * Print the winner model  

In [None]:
# Select the best model based on the median performance metrics 
winner = winner_model_selection('../results/median_CI_results.csv', primary_metric='MCC', secondary_metric='AUC')
print(f"\n Final Winner Model: {winner}")

Checking MCC first...
Top 1: LogisticRegression - Median: 0.9404499623701952, 95% CI: [0.937991108087845, 0.9559686547893308]
Top 2: SVM - Median: 0.9374694791470836, 95% CI: [0.9220658072947296, 0.9466775951900822]

 MCC CIs overlap — checking secondary metric (AUC)...

AUC comparison:
LogisticRegression - Median AUC: 0.9957387139384052
SVM - Median AUC: 0.994593098876032

 Winner: LogisticRegression based on AUC (better separability)

 Final Winner Model: LogisticRegression


6. Train the winner model in the dataset using the best hyperparameters

    * Filter out any warnings using the `warnings` module

    * Load the cleaned dataset again

    * Define the preprocessing `Pipeline` for the dataset:
        - *imputing*
        - *sacling*
        - *classifier definition*

    * Define the hyperparameters of the winner model in the `param_grid` dictionary 

    * Perform grid search using `GridSearchCV` to tune the hyperparameters under the variable `grid_search` variable

    * Filter out any warnings during the process of grid search

    * Print the best hyperparameters set

    * Save the best hyperparameters in the `best_pipeline` variable using the `.best_estimator_` method from sklearn 

    * Train the model using these best hyperparameters

    * Save the model that is trained in the whole dataset with the best hyperparameters

In [None]:
# Suppress warnings for the final model training
warnings.filterwarnings('ignore', category=RuntimeWarning)
warnings.filterwarnings('ignore', category=ConvergenceWarning)
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore')
np.seterr(all='ignore')

# Load the cleaned dataset again
(X, y) = load_data("../data/breast_cancer_final.csv")

# Define the preprocessing pipeline for the final model
pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("classifier", LogisticRegression(max_iter=10000, solver='saga', random_state=42))
])

# Define the hyperparameter grid for the final model
param_grid = {
    'classifier__C': [0.01, 0.1, 1, 10, 100],
    'classifier__l1_ratio': [0.0, 0.25, 0.5, 0.75, 1.0],
    'classifier__penalty': ['elasticnet']
}

# Perform grid search for hyperparameter tuning
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='balanced_accuracy',
    cv=5,
    n_jobs=1,
    verbose=2
)

# Suppress warnings during grid search
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    warnings.simplefilter('ignore', ConvergenceWarning)
    grid_search.fit(X, y)

print("Best Hyperparameters:", grid_search.best_params_)

# Save the best hyperparameters
best_pipeline = grid_search.best_estimator_

# Train the model with the best hyperparameters
best_pipeline.fit(X, y)

# Save the best model to a models directory
joblib.dump(best_pipeline, '../models/winner_model.pkl')

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV] END classifier__C=0.01, classifier__l1_ratio=0.0, classifier__penalty=elasticnet; total time=   0.0s
[CV] END classifier__C=0.01, classifier__l1_ratio=0.0, classifier__penalty=elasticnet; total time=   0.0s
[CV] END classifier__C=0.01, classifier__l1_ratio=0.0, classifier__penalty=elasticnet; total time=   0.0s
[CV] END classifier__C=0.01, classifier__l1_ratio=0.0, classifier__penalty=elasticnet; total time=   0.0s
[CV] END classifier__C=0.01, classifier__l1_ratio=0.0, classifier__penalty=elasticnet; total time=   0.0s
[CV] END classifier__C=0.01, classifier__l1_ratio=0.25, classifier__penalty=elasticnet; total time=   0.0s
[CV] END classifier__C=0.01, classifier__l1_ratio=0.25, classifier__penalty=elasticnet; total time=   0.0s
[CV] END classifier__C=0.01, classifier__l1_ratio=0.25, classifier__penalty=elasticnet; total time=   0.0s
[CV] END classifier__C=0.01, classifier__l1_ratio=0.25, classifier__penalty=elasticnet;

['../models/winner_model.pkl']