# Hyperparameter Optimization

This notebook uses Optuna, a hyperparameter optimization framework, to tune the TF-IDF and SVM hyperparameters of the model pipeline.

In [2]:
# Load the autoreload extension to automatically reload modules when they are modified.
%load_ext autoreload

# Configure the autoreload extension to automatically reload imported modules.
%autoreload 2

# Add the path '../src' to the module search path.
import sys
sys.path.append('../src')

# Import the optuna library for hyperparameter optimization.
import optuna
import logging
from rich.logging import RichHandler

# Swap Optuna's default log handler for a RichHandler (colored logging).
# The RichHandler is attached only when no handler is present yet, so that
# re-running this cell does not add a second one.
logger = logging.getLogger("optuna")
optuna.logging.disable_default_handler()
if not logger.hasHandlers():
    logger.addHandler(RichHandler())


  from .autonotebook import tqdm as notebook_tqdm


###  Hyperparameter Optimization using Optuna

1. **`get_hyperparams` Function:** This function defines a search space for hyperparameters, including TF-IDF and SVM-related parameters. It returns a dictionary containing the hyperparameters to be optimized.

2. **`objective` Function:** This function defines the objective to be maximized during optimization. It creates a model pipeline with the sampled hyperparameters, scores the model on the training data, and stores the mean of each score as a user attribute on the Optuna trial. The value returned to Optuna is the smaller of the mean and the median of the ROC AUC scores.

3. **`optimize` Function:** This function creates an Optuna study, loads training data, and runs the optimization study using the defined objective function. The study is configured for parallel execution (`n_jobs`) and a specified number of trials (`n_trials`).

`Note`:  
A SQLite database is created to store the trial results, allowing the cell to be executed multiple times, resuming from the last trial. To start a new study, simply delete the existing database (./optuna.db).

In [3]:
# Import necessary libraries
import numpy as np
from sklearn.pipeline import Pipeline
from functools import partial
import logging

# Import custom helper functions
from helper import create_model, load_data, score_model

# Define constants for optimization
# N_TRIALS: number of trials requested from `study.optimize` each time the
# optimization cell is executed (passed as `n_trials`).
N_TRIALS = 10
# N_JOBS: passed to `study.optimize` as `n_jobs` (parallel execution of trials).
N_JOBS = 3
# SEED: seed handed to the TPESampler.
# NOTE(review): with N_JOBS > 1 the seed alone may not make runs reproducible —
# confirm against the Optuna reproducibility notes.
SEED = 796856567


def get_hyperparams(trial: optuna.Trial) -> dict:
    """Sample one hyperparameter configuration from the search space.

    Args:
        trial: Optuna trial used to draw every value.

    Returns:
        A dict mapping nested parameter names (``tfidf__...`` and ``cls__...``)
        to the sampled values, in the form handed to ``create_model``.
    """
    # For TF-IDF: upper n-gram bounds for the word and char parts, plus two
    # switches that are shared by both parts.
    word_ngram_max = trial.suggest_int("max_ngram_word", 1, 3)
    char_ngram_max = trial.suggest_int("max_ngram_char", 1, 5)
    idf_enabled = trial.suggest_categorical("use_idf", [True, False])
    fold_case = trial.suggest_categorical("lowercase", [True, False])

    # For SVM: C and tol are sampled on a log scale.
    svm_c = trial.suggest_float("C", 1e-7, 10, log=True)
    svm_loss = trial.suggest_categorical("loss", ["hinge", "squared_hinge"])
    svm_tol = trial.suggest_float("tol", 1e-5, 1e-1, log=True)

    calibration_method = trial.suggest_categorical(
        "calibration", ["isotonic", "sigmoid"]
    )

    # n-gram ranges always start at 1; only the upper bound is tuned.
    tfidf_params = {
        "tfidf__word__ngram_range": (1, word_ngram_max),
        "tfidf__char__ngram_range": (1, char_ngram_max),
        "tfidf__word__lowercase": fold_case,
        "tfidf__char__lowercase": fold_case,
        "tfidf__word__use_idf": idf_enabled,
        "tfidf__char__use_idf": idf_enabled,
    }
    classifier_params = {
        "cls__method": calibration_method,
        "cls__estimator__C": svm_c,
        "cls__estimator__loss": svm_loss,
        "cls__estimator__tol": svm_tol,
    }

    return {**tfidf_params, **classifier_params}


def objective(X_train, y_train, trial: optuna.Trial):
    """Evaluate one trial and return the value Optuna optimizes.

    Args:
        X_train: Training inputs forwarded to ``score_model``.
        y_train: Training targets forwarded to ``score_model``.
        trial: Optuna trial that supplies the hyperparameters and receives
            the recorded scores.

    Returns:
        The smaller of the mean and the median of the ``"test_roc_auc"``
        scores returned by ``score_model``.
    """
    # Build the pipeline from the hyperparameters sampled for this trial.
    model: Pipeline = create_model(get_hyperparams(trial))

    # Score the model on the training data.
    scores = score_model(model, X_train, y_train)

    # Keep the per-metric means on the trial under the "scores" user attribute.
    trial.set_user_attr(
        "scores",
        {metric: values.mean() for metric, values in scores.items()},
    )

    # ROC AUC is the optimization target; taking the lower of mean and median
    # gives the more pessimistic of the two summaries.
    roc_auc = scores["test_roc_auc"]
    return min(np.mean(roc_auc), np.median(roc_auc))

def optimize(n_jobs=1, n_trials=3):
    """Create (or reload) the study and run the hyperparameter search.

    Args:
        n_jobs: Forwarded to ``study.optimize`` as ``n_jobs``.
        n_trials: Forwarded to ``study.optimize`` as ``n_trials``.

    Returns:
        The Optuna study after the optimization run.
    """
    # The study lives in the SQLite file optuna.db; load_if_exists=True makes
    # a repeated call reuse the stored study instead of failing.
    study = optuna.create_study(
        study_name="optimize-model",
        storage="sqlite:///optuna.db",
        load_if_exists=True,
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=SEED),
    )

    # Load the training data once and share it across all trials.
    features, labels = load_data("../data/train.parquet")

    def run_trial(trial):
        # Bind the training data so Optuna only has to supply the trial.
        return objective(features, labels, trial)

    study.optimize(run_trial, n_trials=n_trials, n_jobs=n_jobs)

    return study

# Launch the search with the parallelism (N_JOBS) and trial count (N_TRIALS)
# defined in the constants.
study = optimize(n_jobs=N_JOBS, n_trials=N_TRIALS)


### Results and Visualizations

The next cell prints the best trial's number, parameters and mean scores, plots the optimization history, and displays a styled table of trial results sorted by F1 score, making it easier to compare the different hyperparameter configurations.

In [13]:
# Import necessary libraries
import pandas as pd
import yaml

def display_study(study: optuna.study.Study, sort_by: str = "test_f1") -> None:
    """Plot the optimization history and show a styled table of finished trials.

    Args:
        study: The Optuna study whose trials are displayed.
        sort_by: Column used to order the table, largest value first.
            Defaults to ``"test_f1"``.

    Returns:
        None. Output is rendered through ``fig.show()`` and ``display``. When
        the study has no completed trial with recorded scores, a short message
        is printed and nothing else is rendered.

    Raises:
        KeyError: If ``sort_by`` is not a column of the results table.
    """
    # Convert trials to a DataFrame
    df_trials: pd.DataFrame = study.trials_dataframe()

    # A study without trials yields a frame without these columns.
    required_columns = {"state", "user_attrs_scores"}
    if not required_columns.issubset(df_trials.columns):
        print("No completed trials to display.")
        return

    # Select finished trials by their state instead of dropping every row that
    # has a NaN anywhere: a missing value in an unrelated column must not hide
    # a completed trial. Only the recorded scores are required to be present.
    is_complete = df_trials["state"] == "COMPLETE"
    df_trials = (
        df_trials[is_complete]
        .dropna(subset=["user_attrs_scores"])
        .reset_index(drop=True)
    )
    if df_trials.empty:
        print("No completed trials to display.")
        return

    # Expand the per-trial score dicts into columns and put them in front of
    # the trial data; both frames share the fresh 0..n-1 index.
    df_scores = pd.DataFrame.from_records(df_trials["user_attrs_scores"].tolist())
    df_results = pd.concat([df_scores, df_trials], axis=1)

    # Set the index and drop unnecessary columns
    df_results = df_results.set_index("number").drop(
        columns=[
            "datetime_complete",
            "datetime_start",
            "duration",
            "value",
            "user_attrs_scores",
        ]
    )

    # Sort the results by the chosen metric, best first
    df_results = df_results.sort_values(sort_by, ascending=False)

    # Apply some styling to the DataFrame. The column selection is positional:
    # the first two columns get bars and two-digit formatting, the next five
    # get their maximum highlighted.
    # NOTE(review): this presumes the score columns come out as two timing
    # columns followed by five test metrics — verify if the score set changes.
    df_style = df_results.style
    df_style.bar(df_results.columns[:2], color='LightSalmon', width=50, height=20)
    df_style.format(precision=2, subset=df_results.columns[:2])
    df_style.highlight_max(
        subset=df_results.columns[2:7],
        props="background-color:lightblue;color:black"
    )

    # Plot the optimization history
    fig = optuna.visualization.plot_optimization_history(study)
    fig.show()

    # Display the styled DataFrame
    display(df_style)

# Summarize the best trial as YAML: its number, its sampled parameters, and
# the mean scores stored on it.
best_summary = {
    'best_trial': study.best_trial.number,
    'params': study.best_params,
    'scores': study.best_trial.user_attrs['scores'],
}
print(yaml.safe_dump(best_summary))

# Render the optimization-history plot and the styled results table.
display_study(study)


best_trial: 38
params:
  C: 9.955531930610052
  calibration: isotonic
  loss: squared_hinge
  lowercase: false
  max_ngram_char: 5
  max_ngram_word: 1
  tol: 0.04684368554771205
  use_idf: false
scores:
  fit_time: 2.308107280731201
  score_time: 0.609937334060669
  test_accuracy: 0.9366689803248104
  test_f1: 0.9357027650353651
  test_precision: 0.949199224802703
  test_recall: 0.9226973156538507
  test_roc_auc: 0.9814525825100517



Unnamed: 0_level_0,fit_time,score_time,test_recall,test_precision,test_f1,test_accuracy,test_roc_auc,params_C,params_calibration,params_loss,params_lowercase,params_max_ngram_char,params_max_ngram_word,params_tol,params_use_idf,state
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
57,3.71,0.65,0.921915,0.950482,0.935858,0.93693,0.981297,4.185777,isotonic,squared_hinge,False,5,1,0.0002,False,COMPLETE
53,4.02,0.95,0.921915,0.950482,0.935858,0.93693,0.98133,4.303376,isotonic,squared_hinge,False,5,1,0.000695,False,COMPLETE
38,2.31,0.61,0.922697,0.949199,0.935703,0.936669,0.981453,9.955532,isotonic,squared_hinge,False,5,1,0.046844,False,COMPLETE
35,1.38,0.3,0.912251,0.960419,0.93565,0.937323,0.976985,8.639775,sigmoid,squared_hinge,True,3,1,0.032766,True,COMPLETE
13,1.53,0.64,0.91199,0.96041,0.935509,0.937192,0.976991,7.931461,sigmoid,squared_hinge,True,3,1,0.09616,True,COMPLETE
58,3.05,0.55,0.921654,0.949415,0.935278,0.936278,0.981333,3.62701,isotonic,squared_hinge,False,5,1,0.000242,False,COMPLETE
56,3.36,0.73,0.921654,0.949415,0.935278,0.936278,0.981328,3.685585,isotonic,squared_hinge,False,5,1,0.000279,False,COMPLETE
34,1.18,0.41,0.911728,0.959077,0.934755,0.936408,0.976869,2.030013,sigmoid,squared_hinge,True,3,1,0.088643,True,COMPLETE
55,3.24,0.95,0.919565,0.950364,0.934614,0.935756,0.981331,3.834706,isotonic,squared_hinge,False,5,1,0.000674,False,COMPLETE
52,2.89,0.7,0.919043,0.950318,0.934352,0.935494,0.980711,1.891447,isotonic,squared_hinge,False,5,1,0.000809,False,COMPLETE
