## Optuna

We use optuna to find out the best hyperparameters for the model. We run it for 100 trials on a MacBook M2.

In [10]:
import sys

sys.path.append("../src")

import importer
import encoder
import cleaner
import normalizer
import splitter


raw_train_values, raw_train_labels, raw_test_values = importer.import_data(directory="../Data")
fitted_enc = encoder.create_encoder(raw_train_values)
train_data = encoder.encode(raw_train_values, fitted_enc)
test_data = encoder.encode(raw_test_values, fitted_enc)
train_cleaned = cleaner.clean(train_data, raw_train_labels)
train_normalized, test_data = normalizer.log_transform(train_cleaned, test_data)
train_normalized, test_data = normalizer.normalize(train_normalized, test_data)
X_train, X_val, y_train, y_val = splitter.split(train_normalized)

In [15]:
import optuna
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report

def Optuna(X_train, y_train, X_test, y_test):

    #change categories of y to start from 0 bc softmax likes it that way
    y_train = y_train - 1
    y_test = y_test - 1
        
    # Define an objective function for Optuna to optimize
    def objective(trial):
        # Define the hyperparameters to search
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 1.0),
            'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
        }

        # Create an XGBoost classifier with the suggested hyperparameters
        clf = xgb.XGBClassifier(**params, objective='multi:softmax', num_class=3)

        # Fit the classifier to the training data
        clf.fit(X_train, y_train)

        # Predict on the test data
        y_pred = clf.predict(X_test)

        # Calculate the accuracy
        accuracy = accuracy_score(y_test, y_pred)

        # Calculate the micro F1 score
        micro_f1 = f1_score(y_test, y_pred, average='micro')

        return micro_f1  # Optimize for micro F1 score

    # Create an Optuna study and optimize the objective function
    study = optuna.create_study(direction='maximize')  # For micro F1, maximize the objective
    study.optimize(objective, n_trials=1)  # You can adjust the number of trials

    # Get the best hyperparameters
    best_params = study.best_params
    best_micro_f1 = study.best_value

    print(f"Best Hyperparameters: {best_params}")
    print(f"Best Micro F1 Score: {best_micro_f1}")

    # Train a final model with the best hyperparameters
    best_clf = xgb.XGBClassifier(**best_params, objective='multi:softmax', num_class=3)
    best_clf.fit(X_train, y_train)
    y_pred = best_clf.predict(X_test)
    final_micro_f1 = f1_score(y_test, y_pred, average='micro')
    final_accuracy = accuracy_score(y_test, y_pred)

    print(f"Final Micro F1 Score with Best Hyperparameters: {final_micro_f1}")
    print(f"Final Accuracy with Best Hyperparameters: {final_accuracy}")

    # Print a classification report with precision, recall, and F1-score for each class
    class_report = classification_report(y_test, y_pred)
    print("Classification Report:")
    print(class_report)


In [16]:
Optuna(X_train, y_train, X_val, y_val)

[I 2023-10-08 17:46:42,746] A new study created in memory with name: no-name-36acf7a4-2fe8-4913-8213-4cd937267929
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 1.0),
  'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
[I 2023-10-08 17:46:51,240] Trial 0 finished with value: 0.705253948519593 and parameters: {'n_estimators': 173, 'max_depth': 10, 'learning_rate': 0.00459871636438509, 'subsample': 0.5286556465630148, 'colsample_bytree': 0.5814754669240851}. Best is trial 0 with value: 0.705253948519593.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 1.0),
  'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
[I 2023-10-08 17:47:10,560] Trial 1 finished with value: 0.73991189697779 and parameters: {'n_estimators': 499, 'max_depth': 9, 'learning_rate': 0.03797488736964126, 'subsample'

TypeError: 'dict' object is not callable