In [1]:
import sys
sys.path.append('../')

# Optuna hyperparameter optimization notebook

import optuna
from src.rf import load_data, train_classifier, evaluate_classifier
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

# Load the dataset
X, y = load_data("../models/rf/train/tfidf.parquet")

# Define the objective function
def objective(trial):
    
    # Define hyperparameters space
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 400, step=50),
        "criterion": trial.suggest_categorical("criterion", ["gini", "entropy"]),
        "max_depth": trial.suggest_categorical("max_depth", [None, 100, 200, 300, 400, 500]),
        "max_features": trial.suggest_categorical("max_features", ["sqrt"]),
        "max_leaf_nodes": trial.suggest_categorical("max_leaf_nodes", [None, 200, 400]),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10, step=1),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10, step=1),
        "bootstrap": trial.suggest_categorical("bootstrap", [True]),
        "random_state": 42,
        "n_jobs": -1,
        "verbose": 0
    }
    
    # Initialize 5-fold stratified cross-validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = []
    
    for train_idx, val_idx in skf.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        clf = train_classifier(X_train, y_train, params)
        y_pred, _, metrics = evaluate_classifier(clf, X_val, y_val)
        scores.append(metrics["f1"])
    
    # Return the average F1 score over all folds
    return sum(scores) / len(scores)

# Start the optuna study
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

# Print the results
print(f"Number of finished trials: {len(study.trials)}")
print(f"Best trial:")
trial = study.best_trial
print(f"  F1: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")


2024-04-07 14:49:46 [INFO] Loading data from ../models/rf/train/tfidf.parquet
[I 2024-04-07 14:49:47,635] A new study created in memory with name: no-name-a5092783-202e-4ee0-8d60-28941a36ed38
2024-04-07 14:49:48 [INFO] Training Random Forest classifier
2024-04-07 14:50:07 [INFO] Successfully trained Random Forest classifier
2024-04-07 14:50:07 [INFO] Evaluating classifier
2024-04-07 14:50:08 [INFO] Accuracy: 0.991883
2024-04-07 14:50:08 [INFO] Precision: 0.985139
2024-04-07 14:50:08 [INFO] Recall: 0.948776
2024-04-07 14:50:08 [INFO] F1: 0.966616
2024-04-07 14:50:08 [INFO] ROC AUC: 0.998815
2024-04-07 14:50:09 [INFO] Training Random Forest classifier
2024-04-07 14:50:28 [INFO] Successfully trained Random Forest classifier
2024-04-07 14:50:28 [INFO] Evaluating classifier
2024-04-07 14:50:29 [INFO] Accuracy: 0.993469
2024-04-07 14:50:29 [INFO] Precision: 0.992175
2024-04-07 14:50:29 [INFO] Recall: 0.954819
2024-04-07 14:50:29 [INFO] F1: 0.973139
2024-04-07 14:50:29 [INFO] ROC AUC: 0.99867

Number of finished trials: 10
Best trial:
  F1: 0.9821502963856437
  Params: 
    n_estimators: 400
    criterion: gini
    max_depth: 300
    max_features: sqrt
    max_leaf_nodes: None
    min_samples_split: 6
    min_samples_leaf: 3
    bootstrap: True
