In [4]:
# notebooks/03_training.ipynb
import sys
import pandas as pd
sys.path.append('../')
pd.set_option('display.max_columns',None)
from sklearn.base import clone  # <-- ADD THIS IMPORT
from itertools import product
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from src.config import SCALERS, MODELS, CV_METHODS, HYPERPARAMETERS
from src.data_loader import load_train_dataset
import os
# from src.utils import save_results

In [5]:
# Load preprocessed data (define X, y)
X, y = load_train_dataset(purpose='modelling', split=True)
results = []

In [6]:
from joblib import Parallel, delayed, dump
MODEL_SAVE_DIR = '../experiments/models-tunned'
def run_experiment(model_name, model, scaler_name, scaler, cv_name, cv):
    """Wrapper function for parallel execution of experiments."""

    model_clone = clone(model)    
    pipeline = Pipeline([("scaler", scaler), ("model", model_clone)])

    param_grid = HYPERPARAMETERS[model_name]
    grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring="accuracy", n_jobs=1)
    grid_search.fit(X, y)
    
    # Save the best model
    best_model = grid_search.best_estimator_
    filename = f"{model_name}_{scaler_name}_{cv_name}_best.joblib".replace(" ", "_")
    model_path = os.path.join(MODEL_SAVE_DIR, filename)
    dump(best_model, model_path)
    
    return {
        "model": model_name,
        "scaler": scaler_name,
        "cv": cv_name,
        "best_params": grid_search.best_params_,
        "best_score": grid_search.best_score_,
        "model_path": model_path
    }

# Generate all combinations of models/scalers/CV methods
experiments = product(MODELS.items(), SCALERS.items(), CV_METHODS.items())

# Parallel execution (n_jobs=-1 uses all CPU cores)
results = Parallel(n_jobs=-1, verbose=10)(
    delayed(run_experiment)(model_name, model, scaler_name, scaler, cv_name, cv)
    for (model_name, model), (scaler_name, scaler), (cv_name, cv) in experiments
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


NameError: name 'GridSearchCV' is not defined

In [None]:
# Save results to CSV
results_df = pd.DataFrame(results)
results_df.to_csv("../experiments/results.csv", index=False)