In [30]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../src')

import optuna
import logging
from rich.logging import RichHandler

# Initialize a colored logger: replace Optuna's default handler with a
# RichHandler on the "optuna" logger.
optuna.logging.disable_default_handler()
logger = logging.getLogger("optuna")
# Inspect this logger's own handler list instead of calling hasHandlers():
# hasHandlers() also reports handlers attached to ancestor loggers, which would
# skip the Rich handler whenever propagation is enabled and the root logger is
# configured. The isinstance guard still keeps re-runs of this cell from
# stacking duplicate handlers.
if not any(isinstance(handler, RichHandler) for handler in logger.handlers):
    logger.addHandler(RichHandler())


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [31]:
import numpy as np
from sklearn.pipeline import Pipeline
from functools import partial
from helper import create_model, load_data, score_model

# Search budget (trials per run) and parallelism; both are forwarded to
# optimize() by the call at the bottom of this cell.
N_TRIALS = 10
N_JOBS = 3
# Seed handed to the TPE sampler inside optimize().
SEED = 796856567

def get_hyperparams(trial: optuna.Trial) -> dict:
    """Sample one hyperparameter configuration from ``trial``.

    The word-level and char-level TF-IDF steps each get their own n-gram upper
    bound but share a single ``lowercase`` and ``use_idf`` choice.

    Args:
        trial: Optuna trial used to draw every value.

    Returns:
        A dict keyed by ``step__param`` paths (``tfidf__word__*``,
        ``tfidf__char__*``, ``cls__estimator__*``), as passed to
        ``create_model`` by ``objective``.
    """
    # TF-IDF settings. The suggest_* calls are kept in a fixed order on purpose:
    # NOTE(review): a seeded sampler presumably draws values in call order, so
    # reordering them could change the sampled configurations - confirm.
    word_ngram_max = trial.suggest_int("max_ngram_word", 1, 3)
    char_ngram_max = trial.suggest_int("max_ngram_char", 1, 5)
    idf_enabled = trial.suggest_categorical("use_idf", [True, False])
    fold_case = trial.suggest_categorical("lowercase", [True, False])

    # Classifier settings (the ``cls__estimator__*`` keys).
    # NOTE(review): hinge / squared_hinge losses suggest a linear SVM - confirm
    # against helper.create_model.
    reg_strength = trial.suggest_float("C", 1e-7, 10, log=True)
    loss_name = trial.suggest_categorical("loss", ["hinge", "squared_hinge"])
    tolerance = trial.suggest_float("tol", 1e-5, 1e-1, log=True)

    tfidf_params = {
        "tfidf__word__ngram_range": (1, word_ngram_max),
        "tfidf__char__ngram_range": (1, char_ngram_max),
        "tfidf__word__lowercase": fold_case,
        "tfidf__char__lowercase": fold_case,
        "tfidf__word__use_idf": idf_enabled,
        "tfidf__char__use_idf": idf_enabled,
    }
    classifier_params = {
        "cls__estimator__C": reg_strength,
        "cls__estimator__loss": loss_name,
        "cls__estimator__tol": tolerance,
    }

    return {**tfidf_params, **classifier_params}


def objective(X_train, y_train, trial: optuna.Trial, metric: str = "test_roc_auc"):
    """Score one sampled configuration and return the value Optuna optimizes.

    Args:
        X_train: Training features, forwarded unchanged to ``score_model``.
        y_train: Training labels, forwarded unchanged to ``score_model``.
        trial: Optuna trial that supplies the hyperparameters and receives the
            averaged scores as the ``"scores"`` user attribute.
        metric: Key of ``scores`` to optimize. Defaults to ``"test_roc_auc"``,
            the metric that was previously hard-coded.

    Returns:
        The smaller of the mean and the median of the chosen metric's values,
        so a few unusually good values cannot inflate the objective.
    """
    hyperparams = get_hyperparams(trial)
    model: Pipeline = create_model(hyperparams)

    # NOTE(review): scores presumably maps metric name -> array of per-fold
    # values (cross_validate-style) - confirm against helper.score_model.
    scores = score_model(model, X_train, y_train)
    trial.set_user_attr(
        "scores",
        {name: values.mean() for name, values in scores.items()},
    )

    metric_values = scores[metric]
    return min(np.mean(metric_values), np.median(metric_values))


def optimize(n_jobs=1, n_trials=3):
    """Run a hyperparameter search and return the Optuna study.

    The study is named ``"optimize-model"``, stored in the local SQLite file
    ``optuna.db`` and created with ``load_if_exists=True``.
    NOTE(review): per the Optuna docs this resumes an existing study of the
    same name, so ``n_trials`` adds to earlier runs - confirm that is intended.
    NOTE(review): the sampler is seeded, but Optuna documents parallel runs
    (``n_jobs > 1``) as hard to reproduce - confirm whether that matters here.

    Args:
        n_jobs: Parallelism passed to ``study.optimize``.
        n_trials: Number of trials passed to ``study.optimize``.

    Returns:
        The study after ``study.optimize`` has finished.
    """
    tpe_sampler = optuna.samplers.TPESampler(seed=SEED)
    study = optuna.create_study(
        study_name="optimize-model",
        storage="sqlite:///optuna.db",
        load_if_exists=True,
        sampler=tpe_sampler,
        direction="maximize",
    )

    features, labels = load_data("../data/train.parquet")

    def run_trial(trial):
        # Bind the training data so Optuna only has to supply the trial.
        return objective(features, labels, trial)

    study.optimize(run_trial, n_trials=n_trials, n_jobs=n_jobs)
    return study


# Launch the search with the budget and parallelism configured above.
study = optimize(n_jobs=N_JOBS, n_trials=N_TRIALS)







































In [32]:
import pandas as pd
import yaml

def display_study(study: optuna.study.Study) -> None:
    """Show the optimization history plot and a styled table of trial results.

    Rows containing any missing value are dropped from the trials dataframe.
    Each trial's ``"scores"`` user attribute is expanded into leading columns,
    and the table is sorted by ``test_f1`` in descending order.

    Args:
        study: Study whose trials carry a ``"scores"`` user attribute.
    """
    trials = study.trials_dataframe().dropna(ignore_index=True)

    # Expand the per-trial score dicts into columns placed ahead of the trial
    # columns, drop the bookkeeping columns, and rank by the favorite metric.
    score_columns = pd.DataFrame.from_records(trials['user_attrs_scores'])
    bookkeeping = [
        'datetime_complete',
        "datetime_start",
        "duration",
        'value',
        'user_attrs_scores',
    ]
    results = (
        pd.concat([score_columns, trials], axis=1)
        .set_index("number")
        .drop(columns=bookkeeping)
        .sort_values('test_f1', ascending=False)
    )

    # Stylize with a little color. The positional slices depend on the column
    # order of the expanded scores.
    # NOTE(review): presumably the first two columns are fit_time/score_time and
    # the next five are the test_* metrics - confirm against score_model.
    leading_cols = results.columns[:2]
    highlighted_cols = results.columns[2:7]
    styled = (
        results.style
        .bar(leading_cols, color='LightSalmon', width=50, height=20)
        .format(precision=2, subset=leading_cols)
        .highlight_max(
            subset=highlighted_cols,
            props="background-color:lightblue;color:black",
        )
    )

    # Plot the experiment history first, then show the table beneath it.
    optuna.visualization.plot_optimization_history(study).show()
    display(styled)

# Summarize the winning trial (number, parameters, user attributes) as YAML,
# then render the full comparison table.
best_trial = study.best_trial
print('Best Trial:', best_trial.number)
for section in (study.best_params, best_trial.user_attrs):
    print("---")
    print(yaml.dump(section))
display_study(study)

Best Trial: 68
---
C: 9.783081707940896
loss: hinge
lowercase: true
max_ngram_char: 5
max_ngram_word: 2
tol: 0.0003837000703754547
use_idf: false

---
scores:
  fit_time: 5.561326742172241
  score_time: 0.470765495300293
  test_accuracy: 0.9378441708605273
  test_f1: 0.9363507220470207
  test_precision: 0.9586971256958794
  test_recall: 0.9151224423624977
  test_roc_auc: 0.9810288194974944



Unnamed: 0_level_0,fit_time,score_time,test_recall,test_precision,test_f1,test_accuracy,test_roc_auc,params_C,params_loss,params_lowercase,params_max_ngram_char,params_max_ngram_word,params_tol,params_use_idf,state
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
70,6.44,0.43,0.915122,0.958697,0.936351,0.937844,0.981028,8.627414,hinge,True,5,2,8.5e-05,False,COMPLETE
78,6.55,0.45,0.915122,0.958697,0.936351,0.937844,0.981028,9.438271,hinge,True,5,2,4.7e-05,False,COMPLETE
77,7.23,0.47,0.915122,0.958697,0.936351,0.937844,0.981028,9.839755,hinge,True,5,2,3.6e-05,False,COMPLETE
76,6.54,0.45,0.915122,0.958697,0.936351,0.937844,0.981028,9.196527,hinge,True,5,2,3.7e-05,False,COMPLETE
68,5.56,0.47,0.915122,0.958697,0.936351,0.937844,0.981029,9.783082,hinge,True,5,2,0.000384,False,COMPLETE
64,4.69,0.45,0.915122,0.958697,0.936351,0.937844,0.981027,7.076685,hinge,True,5,2,0.000366,False,COMPLETE
73,6.81,0.45,0.915122,0.958697,0.936351,0.937844,0.981028,9.725129,hinge,True,5,2,4.1e-05,False,COMPLETE
66,5.96,0.44,0.915122,0.958697,0.936351,0.937844,0.981028,8.924998,hinge,True,5,2,0.000106,False,COMPLETE
61,5.83,0.44,0.915122,0.958697,0.936351,0.937844,0.981028,7.921877,hinge,True,5,2,0.000102,False,COMPLETE
65,5.84,0.46,0.914861,0.958686,0.936211,0.937714,0.981028,9.639081,hinge,True,5,2,9.6e-05,False,COMPLETE
