# Hyperparameter Optimization

This notebook uses Optuna, a hyperparameter optimization framework, to optimize the hyperparameters.

In [1]:
# Load the autoreload extension to automatically reload modules when they are modified.
%load_ext autoreload

# Configure the autoreload extension to automatically reload imported modules.
%autoreload 2

# Add the path '../src' to the module search path.
import sys
sys.path.append('../src')

# Import the optuna library for hyperparameter optimization.
import optuna
import logging
from rich.logging import RichHandler

# Send Optuna's log records through a RichHandler (colored logging) instead of
# Optuna's default handler. The handler is attached only when the logger has
# none yet.
logger = logging.getLogger("optuna")
optuna.logging.disable_default_handler()
if not logger.hasHandlers():
    logger.addHandler(RichHandler())


  from .autonotebook import tqdm as notebook_tqdm


###  Hyperparameter Optimization using Optuna

1. **`get_hyperparams` Function:** This function defines a search space for hyperparameters, including TF-IDF and SVM-related parameters. It returns a dictionary containing the hyperparameters to be optimized.

2. **`objective` Function:** This function defines the objective to be maximized during optimization. It creates a model pipeline with the sampled hyperparameters, scores the model on the training data, and stores the mean of each score as a user attribute on the Optuna trial. The value returned to Optuna is the smaller of the mean and the median ROC AUC score.

3. **`optimize` Function:** This function creates an Optuna study, loads training data, and runs the optimization study using the defined objective function. The study is configured for parallel execution (`n_jobs`) and a specified number of trials (`n_trials`).

`Note`:  
A SQLite database is created to store the trial results, so the cell can be executed multiple times: each execution resumes the existing study and adds `N_TRIALS` more trials to it. To start a new study, simply delete the existing database (`./optuna.db`).

In [3]:
# Import necessary libraries
import numpy as np
from sklearn.pipeline import Pipeline
from functools import partial

# Import custom helper functions
from helper import create_model, load_data, score_model

# Optimization settings
SEED = 796_856_567  # seed handed to the TPE sampler
N_JOBS = 5          # value passed as n_jobs when the study is run
N_TRIALS = 40       # value passed as n_trials when the study is run


def get_hyperparams(trial: optuna.Trial) -> dict:
    """Sample one hyperparameter configuration from the search space.

    Suggestions are requested from ``trial`` in a fixed order (TF-IDF options,
    SVM options, calibration method) and then mapped onto ``step__param``
    style keys.

    Args:
        trial: Optuna trial used to draw the suggestions.

    Returns:
        dict: Parameter names mapped to the sampled values.
    """
    # TF-IDF options; each flag is applied to both the "word" and "char" keys.
    word_ngram_max = trial.suggest_int("max_ngram_word", 1, 3)
    char_ngram_max = trial.suggest_int("max_ngram_char", 1, 5)
    idf_enabled = trial.suggest_categorical("use_idf", [True, False])
    lowercase_enabled = trial.suggest_categorical("lowercase", [True, False])

    # SVM options (C and tol are sampled on a log scale).
    svm_c = trial.suggest_float("C", 1e-7, 10, log=True)
    svm_loss = trial.suggest_categorical("loss", ["hinge", "squared_hinge"])
    svm_tol = trial.suggest_float("tol", 1e-5, 1e-1, log=True)

    # Calibration method.
    calibration_method = trial.suggest_categorical(
        "calibration", ["isotonic", "sigmoid"]
    )

    tfidf_params = {
        "tfidf__word__ngram_range": (1, word_ngram_max),
        "tfidf__char__ngram_range": (1, char_ngram_max),
        "tfidf__word__lowercase": lowercase_enabled,
        "tfidf__char__lowercase": lowercase_enabled,
        "tfidf__word__use_idf": idf_enabled,
        "tfidf__char__use_idf": idf_enabled,
    }
    classifier_params = {
        "cls__method": calibration_method,
        "cls__estimator__C": svm_c,
        "cls__estimator__loss": svm_loss,
        "cls__estimator__tol": svm_tol,
    }

    # Merge while keeping the TF-IDF keys first, then the classifier keys.
    return {**tfidf_params, **classifier_params}


def objective(X_train, y_train, trial: optuna.Trial):
    """Evaluate one Optuna trial and return the value to optimize.

    Samples hyperparameters with ``get_hyperparams``, builds the pipeline with
    ``create_model`` and scores it with ``score_model``. The mean of every
    returned metric is attached to the trial as the ``"scores"`` user
    attribute.

    Args:
        X_train: Training inputs, forwarded unchanged to ``score_model``.
        y_train: Training targets, forwarded unchanged to ``score_model``.
        trial: The Optuna trial being evaluated.

    Returns:
        float: The smaller of the mean and the median of
        ``scores["test_roc_auc"]``.
    """
    # Get hyperparameters from the Optuna trial
    hyperparams = get_hyperparams(trial)

    # Create a model pipeline with the specified hyperparameters
    model: Pipeline = create_model(hyperparams)

    # Score the model on training data.
    # NOTE(review): assumes score_model returns a mapping of metric name ->
    # array-like of per-fold values exposing .mean() -- confirm in helper.
    scores = score_model(model, X_train, y_train)

    # Store built-in floats rather than NumPy scalars so the user attribute is
    # plain Python data whatever storage backend the study uses (NumPy scalars
    # cannot be represented by e.g. yaml.safe_dump).
    mean_scores = {
        metric: float(values.mean()) for metric, values in scores.items()
    }
    trial.set_user_attr("scores", mean_scores)

    # The optimization target is based on ROC AUC: take the smaller of the
    # mean and the median of the per-fold values.
    roc_auc = scores["test_roc_auc"]
    return float(min(np.mean(roc_auc), np.median(roc_auc)))

def optimize(n_jobs=1, n_trials=3):
    """Create (or reload) the Optuna study and run trials on the training data.

    The study is stored in ``sqlite:///optuna.db`` under the name
    ``optimize-model`` and is reloaded when it already exists.

    Args:
        n_jobs: Forwarded to ``study.optimize`` as ``n_jobs``.
        n_trials: Forwarded to ``study.optimize`` as ``n_trials``.

    Returns:
        The study object after ``study.optimize`` has returned.
    """
    # Seeded TPE sampler used by the study
    tpe_sampler = optuna.samplers.TPESampler(seed=SEED)

    # Set up the study first; the training data is loaded afterwards
    study = optuna.create_study(
        study_name="optimize-model",
        storage="sqlite:///optuna.db",
        load_if_exists=True,
        sampler=tpe_sampler,
        direction="maximize",
    )

    features, labels = load_data("../data/train.parquet")

    def run_trial(trial):
        # Bind the training data so Optuna only has to supply the trial
        return objective(features, labels, trial)

    study.optimize(run_trial, n_jobs=n_jobs, n_trials=n_trials)

    return study

# Launch the study with the job and trial counts configured above
study = optimize(n_jobs=N_JOBS, n_trials=N_TRIALS)


### Results and Visualizations

This cell provides a comprehensive view of hyperparameter optimization results, making it easier to analyze and interpret the performance of different hyperparameter configurations.

In [4]:
# Import necessary libraries
import pandas as pd
import yaml

def display_study(study: optuna.study.Study) -> None:
    """Show the optimization-history plot and a styled table of trial results.

    The table combines each trial's ``scores`` user attribute with the trial
    data, is indexed by trial number and sorted by ``test_f1`` (descending).
    Rows with any missing value are dropped before the table is built.

    Args:
        study: The Optuna study whose trials are displayed.
    """
    # Trials as a frame; rows containing any missing value are discarded
    trials: pd.DataFrame = study.trials_dataframe().dropna(ignore_index=True)

    # Expand the per-trial score dictionaries into one column per metric
    score_columns = pd.DataFrame.from_records(trials['user_attrs_scores'])

    # Columns that are not shown in the final table
    hidden_columns = [
        'datetime_complete',
        "datetime_start",
        "duration",
        'value',
        'user_attrs_scores',
    ]

    # Scores first, then trial data; sorted by the favorite metric
    # (test_f1 in this case)
    results = (
        pd.concat([score_columns, trials], axis=1)
        .set_index("number")
        .drop(hidden_columns, axis=1)
        .sort_values('test_f1', ascending=False)
    )

    # Positional column groups: the first two columns get bars and 2-digit
    # formatting, the next five get max-highlighting
    leading_two = results.columns[:2]
    next_five = results.columns[2:7]

    styled = (
        results.style
        .bar(leading_two, color='LightSalmon', width=50, height=20)
        .format(precision=2, subset=leading_two)
        .highlight_max(
            subset=next_five,
            props="background-color:lightblue;color:black",
        )
    )

    # Optimization history plot first, then the styled table
    optuna.visualization.plot_optimization_history(study).show()
    display(styled)

# Summarize the best trial (number, parameters, mean scores) as YAML
best_summary = {
    'best_trial': study.best_trial.number,
    'params': study.best_params,
    'scores': study.best_trial.user_attrs['scores'],
}
print(yaml.safe_dump(best_summary))

# Render the optimization-history plot and the styled results table
display_study(study)


best_trial: 49
params:
  C: 9.517286338692262
  calibration: sigmoid
  loss: hinge
  lowercase: false
  max_ngram_char: 5
  max_ngram_word: 2
  tol: 1.737479413052696e-05
  use_idf: false
scores:
  fit_time: 8.66616473197937
  score_time: 0.9940863132476807
  test_accuracy: 0.9386274606777597
  test_f1: 0.9372405327308085
  test_precision: 0.958233745473023
  test_recall: 0.9172098499974402
  test_roc_auc: 0.981777593848566



Unnamed: 0_level_0,fit_time,score_time,test_recall,test_precision,test_f1,test_accuracy,test_roc_auc,params_C,params_calibration,params_loss,params_lowercase,params_max_ngram_char,params_max_ngram_word,params_tol,params_use_idf,state
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
29,10.38,2.78,0.918777,0.956998,0.937472,0.938758,0.981549,1.777705,sigmoid,hinge,False,5,2,1.3e-05,False,COMPLETE
49,8.67,0.99,0.91721,0.958234,0.937241,0.938627,0.981778,9.517286,sigmoid,hinge,False,5,2,1.7e-05,False,COMPLETE
31,8.34,2.86,0.917733,0.955158,0.936028,0.937322,0.981001,2.253035,sigmoid,squared_hinge,False,5,2,2.6e-05,False,COMPLETE
30,8.31,2.8,0.917211,0.954884,0.935618,0.93693,0.980809,1.870928,sigmoid,squared_hinge,False,5,2,2.7e-05,False,COMPLETE
46,7.91,3.87,0.917211,0.954367,0.93537,0.936669,0.980739,1.774359,sigmoid,squared_hinge,False,5,2,1.8e-05,False,COMPLETE
16,10.02,3.72,0.910159,0.961373,0.935001,0.9368,0.980104,4.519272,sigmoid,hinge,False,4,3,0.043806,False,COMPLETE
17,9.7,3.75,0.910159,0.961105,0.934876,0.936669,0.980108,5.481265,sigmoid,hinge,False,4,3,0.07526,False,COMPLETE
47,9.44,2.43,0.916688,0.953043,0.934464,0.935755,0.980426,1.402909,sigmoid,squared_hinge,False,5,2,0.000129,False,COMPLETE
14,7.19,2.08,0.907286,0.963374,0.93443,0.936408,0.980288,8.836121,sigmoid,hinge,False,5,3,0.099597,False,COMPLETE
15,7.09,2.37,0.907286,0.963374,0.93443,0.936408,0.980287,4.094959,sigmoid,hinge,False,5,3,0.081921,False,COMPLETE
