In [2]:
import optuna

import numpy as np
import pandas as pd
from utils import load_final_data
from sklearn.metrics import f1_score
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler


In [3]:
X_train, X_val, X_test, y_train, y_val, y_test = load_final_data()

In [4]:
work_features = [c for c in X_train.columns if not c.startswith('Cluster_number')]

X_train = X_train[work_features]
X_val = X_val[work_features]
X_test = X_test[work_features]

y_train = y_train.values.argmax(1)
y_val = y_val.values.argmax(1)
y_test = y_test.values.argmax(1)

In [6]:
def objective(trial):
    param = {
        'solver':trial.suggest_categorical('solver',['newton-cg', 'lbfgs','sag', 'saga']),
        'penalty':trial.suggest_categorical('penalty',['l2','none']),
        'multi_class':trial.suggest_categorical('multi_class',['multinomial','ovr']),
        'C':trial.suggest_float('C',.01, 10)
    }

    penalty = trial.suggest_categorical('penalty',['l2','none'])
    solver = trial.suggest_categorical('solver',['newton-cg', 'lbfgs','sag',
                                               'saga'])
    multi_class=trial.suggest_categorical('multi_class',['multinomial','ovr'])
    C=trial.suggest_float('C',.01, 10)

    clf=LogisticRegression(penalty=penalty,solver=solver,
                                 multi_class=multi_class,C=C,max_iter=10)

    # scalers = trial.suggest_categorical("scalers", ['minmax', 'standard', 'robust'])
    #
    # # (b) Define your scalers
    # if scalers == "minmax":
    #     scaler = MinMaxScaler()
    # elif scalers == "standard":
    #     scaler = StandardScaler()
    # else:
    #     scaler = RobustScaler()
    #
    # # -- Instantiate dimensionality reduction
    #  # (a) List all dimensionality reduction options
    # dim_red = trial.suggest_categorical("dim_red", ["PCA", None])
    #
    # # (b) Define the PCA algorithm and its hyperparameters
    # if dim_red == "PCA":
    #     pca_n_components=trial.suggest_int("pca_n_components", 2, 30) # suggest an integer from 2 to 30
    #     dimen_red_algorithm=PCA(n_components=pca_n_components)
    # # (c) No dimensionality reduction option
    # else:
    #     dimen_red_algorithm='passthrough'
    #
    # pipeline = make_pipeline(scaler, dimen_red_algorithm, clf)
    clf.fit(X_train,y_train)
    # -- Evaluate the score by cross-validation
    # score = cross_val_score(pipeline, X_val, y_val, scoring='f1')
    # f1 = score.mean() # calculate the mean of scores
    y_pred=clf.predict(X_val)
    return f1_score(y_val, y_pred, average='weighted')

In [9]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50, timeout=600)

# print("Number of finished trials: ", len(study.trials))
# print("Best trial:")
# trial = study.best_trial

# print("  Value: {}".format(trial.value))
# print("  Params: ")
# for key, value in trial.params.items():
#     print("    {}: {}".format(key, value))

[32m[I 2023-01-18 20:20:52,786][0m A new study created in memory with name: no-name-f8944336-20f7-4b07-acb9-646484ea9aec[0m
[32m[I 2023-01-18 20:21:26,610][0m Trial 0 finished with value: 0.44633007944167535 and parameters: {'solver': 'newton-cg', 'penalty': 'l2', 'multi_class': 'multinomial', 'C': 5.172097011022454}. Best is trial 0 with value: 0.44633007944167535.[0m
  "Setting penalty='none' will ignore the C and l1_ratio parameters"
[32m[I 2023-01-18 20:22:34,162][0m Trial 1 finished with value: 0.44015357082277173 and parameters: {'solver': 'saga', 'penalty': 'none', 'multi_class': 'ovr', 'C': 7.710790540694397}. Best is trial 0 with value: 0.44633007944167535.[0m
[32m[I 2023-01-18 20:23:10,357][0m Trial 2 finished with value: 0.4447416675173431 and parameters: {'solver': 'saga', 'penalty': 'l2', 'multi_class': 'multinomial', 'C': 9.408085045597984}. Best is trial 0 with value: 0.44633007944167535.[0m
  "Setting penalty='none' will ignore the C and l1_ratio parameters"

In [11]:
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

Number of finished trials:  16
Best trial:
  Value: 0.44633007944167535
  Params: 
    solver: newton-cg
    penalty: l2
    multi_class: multinomial
    C: 5.172097011022454


In [None]:
import matplotlib.pyplot as plt
label_categories = list(y_train.columns)

plt.figure(figsize = (8,6))

y_test_numeric = y_test.to_numpy().argmax(1)
m = sklearn.metrics.confusion_matrix(y_test_numeric, preds)
ax = sns.heatmap(m, annot=True, fmt='d')
ax.set(xlabel="Predicted labels", ylabel="True labels",
       xticklabels=label_categories, yticklabels=label_categories)

plt.yticks(rotation=0)
plt.xticks(rotation=45)

plt.show()