In [None]:
import sys

sys.path.append("../")

# Optuna hyperparameter optimization notebook for the number of features in the TF-IDF vectorizer

import logging

import optuna
from sklearn.metrics import f1_score

from src.rf import evaluate_classifier, load_data, train_classifier
from src.tfidf import main as tfidf_main

# Cache of already-evaluated scores, keyed by the sampled ``max_features``
# value, so a repeated sample does not trigger a full recomputation.
previous_trials = {}


# Define the objective function
def objective(trial: "optuna.trial.Trial") -> float:
    """Score one TF-IDF vocabulary size for an Optuna trial.

    Samples ``max_features`` for the TF-IDF vectorizer, runs ``tfidf_main``
    with that setting, loads the resulting train/test TF-IDF files, trains a
    classifier with a fixed set of model parameters and returns the weighted
    F1 score of its predictions on the test file.

    Scores are memoised in the module-level ``previous_trials`` dict, keyed
    by ``max_features``, so a value that is sampled again is answered from
    the cache instead of being recomputed.

    NOTE(review): the score is computed on the test split, so the selected
    ``max_features`` is effectively tuned against test data -- confirm this
    is intended.

    Args:
        trial: The Optuna trial used to sample ``max_features``.

    Returns:
        The weighted F1 score obtained for the sampled ``max_features``.
    """
    # Search space: 1000, 2000, ..., 10000.
    max_features = trial.suggest_int("max_features", 1000, 10000, step=1000)

    # Answer repeated samples from the cache before doing any expensive work.
    if max_features in previous_trials:
        return previous_trials[max_features]

    tfidf_cfg = {
        "train_file": "../models/rf/train/processed.parquet",
        "test_file": "../models/rf/test/processed.parquet",
        "output_train_tfidf": "../models/rf/train/tfidf.parquet",
        "output_test_tfidf": "../models/rf/test/tfidf.parquet",
        "tfidf_vectorizer": "../models/rf/tfidf_vectorizer.pickle",
        "tfidf_params": {
            "max_features": max_features,
        },
    }
    # Model parameters are held constant; only the TF-IDF size is searched.
    rf_cfg = {
        "model_params": {
            "n_estimators": 100,
            "criterion": "entropy",
            "max_depth": 500,
            "max_features": "sqrt",
            "max_leaf_nodes": None,
            "min_samples_split": 5,
            "min_samples_leaf": 1,
            "bootstrap": True,
            "random_state": 42,
            "n_jobs": -1,
            "verbose": 0,
        },
    }

    # Compute the TF-IDF vectors for the sampled vocabulary size
    logging.info(f"Max features: {max_features}")
    tfidf_main(cfg=tfidf_cfg)

    # Load the dataset from the very paths handed to ``tfidf_main`` so the
    # producer and the consumer of these files cannot drift apart.
    X, y = load_data(tfidf_cfg["output_train_tfidf"])
    X_test, y_test = load_data(tfidf_cfg["output_test_tfidf"])

    # Train the classifier
    clf = train_classifier(X, y, rf_cfg["model_params"])

    # Evaluate the classifier
    y_pred = clf.predict(X_test)
    score = f1_score(y_test, y_pred, average="weighted")
    logging.info(f"F1: {score}")

    # Remember the score for this hyperparameter value
    previous_trials[max_features] = score

    return score


# Run the search: maximise the value returned by ``objective`` over 20 trials
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)

# Report how many trials ran, then the best trial's score and parameters
logging.info(f"Number of finished trials: {len(study.trials)}")
logging.info("Best trial:")
trial = study.best_trial
logging.info(f"  F1: {trial.value}")
logging.info("  Params: ")
for key, value in trial.params.items():
    logging.info(f"    {key}: {value}")