In [1]:
import sys
import pandas as pd
sys.path.append('../')
pd.set_option('display.max_columns',None)
from sklearn.base import clone  # <-- ADD THIS IMPORT
from itertools import product
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from src.config import SCALERS, MODELS, CV_METHODS, HYPERPARAMETERS
from src.data_loader import load_train_dataset
from sklearn.model_selection import GridSearchCV
import os
from joblib import Parallel, delayed, dump

In [2]:
# Directory that receives the serialized best pipeline of each experiment.
MODEL_SAVE_DIR = "../experiments/models-tunned"

# Training inputs and targets; these are the arrays handed to `fit(X, y)`
# inside `run_experiment`.
X, y = load_train_dataset(split=True, purpose="modelling")

# Placeholder so the name exists; it is rebound by the parallel run further down.
results = list()

In [None]:
def run_experiment(model_name, model, scaler_name, scaler, cv_name, cv):
    """Tune one (model, scaler, CV) combination and persist the best pipeline.

    Builds a ``scaler -> model`` pipeline, grid-searches it over
    ``HYPERPARAMETERS[model_name]`` using accuracy as the score, fits on the
    module-level ``X`` and ``y``, and dumps the best estimator found into
    ``MODEL_SAVE_DIR``. Intended as the unit of work for parallel execution.

    Parameters
    ----------
    model_name
        Key into ``HYPERPARAMETERS``; also embedded in the output filename
        and in the returned record.
    model
        Estimator to tune. A clone is placed in the pipeline, so the object
        passed in is not the one being fitted.
    scaler_name
        Label for the scaler, used in the output filename and the record.
    scaler
        Object used as the first ("scaler") pipeline step.
    cv_name
        Label for the CV scheme, used in the output filename and the record.
    cv
        Forwarded unchanged as ``GridSearchCV(cv=...)``.

    Returns
    -------
    dict
        Keys ``"model"``, ``"scaler"``, ``"cv"``, ``"best_params"``,
        ``"best_score"`` and ``"model_path"`` (where the pipeline was saved).

    Raises
    ------
    KeyError
        If ``model_name`` has no entry in ``HYPERPARAMETERS``.
    OSError
        If ``MODEL_SAVE_DIR`` cannot be created or written to.
    """
    # Make sure the output directory exists *before* the expensive search, so
    # a bad path fails immediately instead of after the fit. `exist_ok=True`
    # keeps this safe when several workers reach this line at the same time.
    os.makedirs(MODEL_SAVE_DIR, exist_ok=True)

    model_clone = clone(model)
    pipeline = Pipeline([("scaler", scaler), ("model", model_clone)])

    # NOTE(review): grid keys presumably address pipeline steps
    # (e.g. "model__<param>") -- confirm against src.config.
    param_grid = HYPERPARAMETERS[model_name]

    # The search itself stays single-process (n_jobs=1); this function is the
    # per-task wrapper, so parallelism is expected to come from the caller.
    grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring="accuracy", n_jobs=1)
    grid_search.fit(X, y)

    best_model = grid_search.best_estimator_
    # Spaces in any of the labels are replaced so the filename has none.
    filename = f"{model_name}_{scaler_name}_{cv_name}_best.joblib".replace(" ", "_")
    model_path = os.path.join(MODEL_SAVE_DIR, filename)
    dump(best_model, model_path)

    return {
        "model": model_name,
        "scaler": scaler_name,
        "cv": cv_name,
        "best_params": grid_search.best_params_,
        "best_score": grid_search.best_score_,
        "model_path": model_path
    }

In [3]:
# Cartesian product of every (name, object) pair for models, scalers and CV schemes.
experiments = product(MODELS.items(), SCALERS.items(), CV_METHODS.items())

# Each pair is star-unpacked in place, so run_experiment receives
# (model_name, model, scaler_name, scaler, cv_name, cv) positionally.
results = Parallel(n_jobs=-1, verbose=10)(
    delayed(run_experiment)(*model_item, *scaler_item, *cv_item)
    for model_item, scaler_item, cv_item in experiments
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed: 14.1min
[Parallel(n_jobs=-1)]: Done  17 out of  36 | elapsed: 76.2min remaining: 85.1min
[Parallel(n_jobs=-1)]: Done  21 out of  36 | elapsed: 85.0min remaining: 60.7min
[Parallel(n_jobs=-1)]: Done  25 out of  36 | elapsed: 91.2min remaining: 40.1min
[Parallel(n_jobs=-1)]: Done  29 out of  36 | elapsed: 95.1min remaining: 23.0min
[Parallel(n_jobs=-1)]: Done  33 out of  36 | elapsed: 99.9min remaining:  9.1min
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed: 100.5min finished


In [4]:
# One row per experiment record returned by the parallel run.
results_df = pd.DataFrame(data=results)

# Persist the summary table without the positional index column.
results_df.to_csv(
    "../experiments/results-tunned.csv",
    index=False,
)