In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import json
import time
import plotly
from datetime import datetime
import optuna
import sklearn
from sklearnex import patch_sklearn
patch_sklearn()

In [None]:
# Run configuration shared by the cells below.
SEED = 10            # seed value reserved for stochastic components
limit_rows = 1_000   # size of the partial dataset to load; None selects the full dataset in load_csv

In [None]:
# Wall-clock marker for the start of the notebook run.
print(f"Execution started at {datetime.now():%Y-%m-%d %H:%M:%S}.")

In [None]:
def load_csv(name):
    """Read one ``iot23_combined`` CSV split and return it as a NumPy array.

    Parameters
    ----------
    name : str
        Suffix of the file to read (the notebook uses 'X_train', 'X_test',
        'y_train' and 'y_test').

    Returns
    -------
    numpy.ndarray
        The file contents. A file with a single column is flattened to 1-D;
        anything wider is returned as a 2-D array.

    Notes
    -----
    The module-level ``limit_rows`` selects the file: ``None`` reads from the
    ``full`` directory, any other value reads the ``partial`` file whose name
    carries ``int(limit_rows/1000)`` followed by ``k``.
    """
    if limit_rows is not None:
        csv_path = f'../Data Preprocessing/sklearn/partial/iot23_combined_{int(limit_rows/1000)}k_{name}.csv'
    else:
        csv_path = f'../Data Preprocessing/sklearn/full/iot23_combined_{name}.csv'

    # The files have no header row, so every line is data.
    table = pd.read_table(filepath_or_buffer=csv_path, header=None, sep=',')
    values = table.infer_objects().to_numpy()

    if values.shape[1] == 1:
        return values.ravel()
    return values

In [None]:
# Load the four pre-split arrays (features first, then labels).
X_train = load_csv('X_train')
X_test = load_csv('X_test')
y_train = load_csv('y_train')
y_test = load_csv('y_test')

# Report the shape of every split as a quick sanity check.
print('X_train', X_train.shape, '\ny_train', y_train.shape)
print('X_test', X_test.shape, '\ny_test', y_test.shape)

In [None]:
# Emit Optuna's own per-trial log records at the most detailed level.
optuna.logging.set_verbosity(optuna.logging.DEBUG)


def print_trial_callback(study, trial):
    """Print a one-line summary of a finished trial.

    Optional Optuna callback; it is not registered by default. To enable it,
    pass ``callbacks=[print_trial_callback]`` to ``study.optimize``.

    Parameters
    ----------
    study : optuna.study.Study
        The running study (unused, required by the callback signature).
    trial : optuna.trial.FrozenTrial
        The trial that just finished, in any state.
    """
    # Trials that did not complete have no objective value; show NaN instead
    # of failing inside the callback.
    score = float('nan') if trial.value is None else trial.value
    # A trial can fail before 'classifier_name' was sampled.
    classifier = trial.params.get('classifier_name', 'n/a')
    print(f"[{trial.number:06d}] {str(trial.state).ljust(20)}\tScore = {score:.9f}\tClassifier = {classifier}\n")

In [None]:
# Wall-clock marker printed ahead of the model/search-space definitions.
print(f"Optimization started at {datetime.now():%Y-%m-%d %H:%M:%S}.")

In [None]:
from sklearn.kernel_approximation import Nystroem
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import PassiveAggressiveClassifier, Perceptron
from sklearn.naive_bayes import ComplementNB, GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Per-classifier leaderboard: classifier name -> list of result dicts,
# each carrying at least a 'score' entry (consumed by the reporting cells).
best_results = dict()

# Model families the study may pick from, in alphabetical order.
classifier_names = [
    'AdaBoostClassifier', 'ComplementNB', 'DecisionTreeClassifier', 'ExtraTreesClassifier',
    'GaussianNB', 'GaussianProcessClassifier', 'KNeighborsClassifier', 'LinearDiscriminantAnalysis',
    'LinearSVC', 'MLPClassifier', 'MultinomialNB', 'NystroemLinearSVC',
    'PassiveAggressiveClassifier', 'QuadraticDiscriminantAnalysis', 'RandomForestClassifier', 'XGBClassifier',
]

def _build_classifier(trial, classifier_name):
    """Sample hyperparameters for ``classifier_name`` and return an unfitted estimator.

    Reads the module-level ``X_train`` / ``y_train`` (to bound feature- and
    class-dependent ranges) and ``SEED`` (passed as ``random_state`` to every
    estimator that accepts one, so repeated evaluations of the same
    hyperparameters give the same model).

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.
    classifier_name : str
        One of the names handled below.

    Returns
    -------
    estimator
        An object exposing ``fit(X, y)`` and ``predict(X)``.

    Raises
    ------
    NotImplementedError
        If no search space is defined for ``classifier_name``. This is
        deliberately not a ``ValueError`` so that a ``catch=(ValueError,)``
        on ``study.optimize`` cannot hide a misconfigured name.
    """
    # Local import keeps this block self-contained.
    from sklearn.pipeline import make_pipeline

    n_features = X_train.shape[1]

    if classifier_name == 'AdaBoostClassifier':
        return AdaBoostClassifier(
            n_estimators=trial.suggest_int('abc_n_estimators', 10, 200, step=10),
            learning_rate=trial.suggest_float('abc_learning_rate', 1e-6, 1e0, log=True),
            random_state=SEED)

    if classifier_name == 'ComplementNB':
        return ComplementNB(
            alpha=trial.suggest_float('cnb_alpha', 0.1, 1.0, step=0.1),
            fit_prior=trial.suggest_categorical('cnb_fit_prior', [False, True]),
            norm=trial.suggest_categorical('cnb_norm', [False, True]))

    if classifier_name == 'DecisionTreeClassifier':
        return DecisionTreeClassifier(
            criterion=trial.suggest_categorical('dtc_criterion', ['gini', 'entropy']),
            splitter=trial.suggest_categorical('dtc_splitter', ['best', 'random']),
            min_samples_split=trial.suggest_int('dtc_min_samples_split', 2, 50),
            min_samples_leaf=trial.suggest_int('dtc_min_samples_leaf', 1, 50),
            max_features=trial.suggest_int('dtc_max_features', 1, n_features),
            random_state=SEED)

    if classifier_name == 'ExtraTreesClassifier':
        return ExtraTreesClassifier(
            n_estimators=trial.suggest_int('etc_n_estimators', 10, 200, step=10),
            criterion=trial.suggest_categorical('etc_criterion', ['gini', 'entropy']),
            min_samples_split=trial.suggest_int('etc_min_samples_split', 2, 50),
            min_samples_leaf=trial.suggest_int('etc_min_samples_leaf', 1, 50),
            max_features=trial.suggest_int('etc_max_features', 1, n_features),
            bootstrap=trial.suggest_categorical('etc_bootstrap', [False, True]),
            random_state=SEED)

    if classifier_name == 'GaussianNB':
        return GaussianNB(
            var_smoothing=trial.suggest_float('gnb_var_smoothing', 1e-12, 1e0, log=True))

    if classifier_name == 'GaussianProcessClassifier':
        return GaussianProcessClassifier(
            max_iter_predict=trial.suggest_int('gpc_max_iter_predict', 50, 200, step=50),
            multi_class=trial.suggest_categorical('gpc_multi_class', ['one_vs_one', 'one_vs_rest']),
            random_state=SEED)

    if classifier_name == 'KNeighborsClassifier':
        return KNeighborsClassifier(
            n_neighbors=trial.suggest_int('knc_n_neighbors', 10, 100, step=10),
            leaf_size=trial.suggest_int('knc_leaf_size', 10, 100, step=10))

    if classifier_name == 'LinearDiscriminantAnalysis':
        # n_components is bounded above by min(n_features, n_classes - 1).
        n_classes = len(set(y_train))
        return LinearDiscriminantAnalysis(
            n_components=trial.suggest_int('lda_n_components', 1, min(n_features, n_classes - 1)))

    if classifier_name == 'LinearSVC':
        return LinearSVC(
            dual=trial.suggest_categorical('lsvc_dual', [False]),
            C=trial.suggest_float('lsvc_C', 1e-6, 1e3, log=True),
            random_state=SEED)

    if classifier_name == 'MLPClassifier':
        hidden_layer_count = trial.suggest_int('mlpc_hidden_layer_count', 1, 3, step=1)
        # Layer widths halve from the first hidden layer to the last,
        # which always has 2 * n_features units.
        hidden_layer_sizes = tuple(
            int(2 * n_features * 2 ** (hidden_layer_count - i - 1))
            for i in range(hidden_layer_count))
        return MLPClassifier(
            hidden_layer_sizes=hidden_layer_sizes,
            learning_rate=trial.suggest_categorical('mlpc_learning_rate', ['constant', 'invscaling', 'adaptive']),
            learning_rate_init=trial.suggest_float('mlpc_learning_rate_init', 1e-6, 1e0, log=True),
            max_iter=trial.suggest_int('mplc_max_iter', 250, 1000, step=250),
            random_state=SEED)

    if classifier_name == 'MultinomialNB':
        return MultinomialNB(
            alpha=trial.suggest_float('mnb_alpha', 0.1, 1.0, step=0.1),
            fit_prior=trial.suggest_categorical('mnb_fit_prior', [False, True]))

    if classifier_name == 'NystroemLinearSVC':
        # Kernel approximation followed by a linear SVM. Wrapping both in a
        # pipeline lets the caller use plain fit/predict: the feature map is
        # fitted on the training data and re-applied at prediction time.
        feature_map = Nystroem(
            gamma=trial.suggest_float("nystroem_gamma", 0.1, 1.0, step=0.1),
            n_components=trial.suggest_int("nystroem_n_components", 50, 200, step=25),
            random_state=SEED)
        linear_svc = LinearSVC(
            dual=trial.suggest_categorical('dual', [False]),
            C=trial.suggest_float('C', 1e-6, 1e3, log=True),
            random_state=SEED)
        return make_pipeline(feature_map, linear_svc)

    if classifier_name == 'PassiveAggressiveClassifier':
        return PassiveAggressiveClassifier(
            C=trial.suggest_float('pac_C', 1e-6, 1e3, log=True),
            early_stopping=trial.suggest_categorical('pac_early_stopping', [False, True]),
            validation_fraction=trial.suggest_categorical('pac_validation_fraction', [0.25]),
            n_iter_no_change=trial.suggest_int('pac_n_iter_no_change', 5, 20, step=5),
            random_state=SEED)

    if classifier_name == 'Perceptron':
        # Only reached when 'Perceptron' is listed in classifier_names.
        return Perceptron(
            penalty=trial.suggest_categorical('p_penalty', ['l2', 'l1', 'elasticnet']),
            alpha=trial.suggest_float('p_alpha', 0.1, 1.0, step=0.1),
            eta0=trial.suggest_float('p_eta0', 0.5, 2.0, step=0.5),
            early_stopping=trial.suggest_categorical('p_early_stopping', [False, True]),
            validation_fraction=trial.suggest_categorical('p_validation_fraction', [0.25]),
            n_iter_no_change=trial.suggest_int('p_n_iter_no_change', 5, 20, step=5),
            random_state=SEED)

    if classifier_name == 'QuadraticDiscriminantAnalysis':
        return QuadraticDiscriminantAnalysis(
            reg_param=trial.suggest_float('qda_reg_param', 0.1, 1.0, step=0.1))

    if classifier_name == 'RandomForestClassifier':
        return RandomForestClassifier(
            n_estimators=trial.suggest_int('rfc_n_estimators', 10, 200, step=10),
            criterion=trial.suggest_categorical('criterion', ['gini', 'entropy']),
            min_samples_split=trial.suggest_int('rfc_min_samples_split', 2, 50),
            min_samples_leaf=trial.suggest_int('rfc_min_samples_leaf', 1, 50),
            max_features=trial.suggest_int('rfc_max_features', 1, n_features),
            bootstrap=trial.suggest_categorical('rfc_bootstrap', [False, True]),
            random_state=SEED)

    if classifier_name == 'XGBClassifier':
        return XGBClassifier(
            n_estimators=trial.suggest_int('xgbc_n_estimators', 10, 200, step=10),
            use_label_encoder=trial.suggest_categorical('xgbc_use_label_encoder', [False]),
            learning_rate=trial.suggest_float('xgbc_learning_rate', 1e-6, 1e0, log=True),
            booster=trial.suggest_categorical('xgbc_booster', ['gbtree', 'gblinear', 'dart']),
            gamma=trial.suggest_float('xgbc_gamma', 1e-6, 1e0, log=True),
            random_state=SEED)

    raise NotImplementedError(f"No search space defined for classifier {classifier_name!r}")


def objective(trial):
    """Optuna objective: pick a classifier, fit it, and return its accuracy.

    The classifier family is itself a categorical hyperparameter
    ('classifier_name', drawn from the module-level ``classifier_names``);
    the remaining hyperparameters depend on that choice.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the classifier and its hyperparameters.

    Returns
    -------
    float
        Accuracy of the fitted model on ``X_test`` / ``y_test``.

    Notes
    -----
    NOTE(review): the score is computed on the same test split for every
    trial, so the best score is an optimistic estimate of generalization;
    consider scoring on a separate validation split.
    """
    # Import the metric explicitly: `import sklearn` alone does not
    # guarantee that the `sklearn.metrics` submodule has been loaded.
    from sklearn.metrics import accuracy_score

    classifier_name = trial.suggest_categorical('classifier_name', classifier_names)
    classifier_obj = _build_classifier(trial, classifier_name)

    # fit, predict and evaluate
    classifier_obj.fit(X_train, y_train)
    y_pred = classifier_obj.predict(X_test)

    return accuracy_score(y_test, y_pred)

In [None]:
# Wall-clock marker printed right before the study is created and run.
print(f"Optimization started at {datetime.now():%Y-%m-%d %H:%M:%S}.")

In [None]:
# Seed the sampler from SEED so the sequence of suggested hyperparameters is
# tied to the notebook's configured seed. TPE is the sampler create_study
# uses by default for a single objective, so the algorithm is unchanged.
# NOTE(review): with n_jobs > 1 trials finish in a timing-dependent order, so
# runs may still differ; use n_jobs=1 if exact reproducibility is required.
study = optuna.create_study(direction="maximize",
                            sampler=optuna.samplers.TPESampler(seed=SEED))

# Four trials per classifier family on average, run on 8 worker threads.
# ValueError raised while sampling or fitting marks the trial as failed
# instead of aborting the whole study.
study.optimize(objective, n_trials=4*len(classifier_names), n_jobs=8, catch=(ValueError,))

In [None]:
# Wall-clock marker for the end of the hyperparameter search.
print(f"Optimization finished at {datetime.now():%Y-%m-%d %H:%M:%S}.")

In [None]:
# Render the study's optimization history figure.
history_figure = optuna.visualization.plot_optimization_history(study)
history_figure.show()

In [None]:
# Build the per-classifier leaderboard from the study's own trial records.
# Only completed trials carry a score; failed ones are skipped.
best_results = {}
for finished_trial in study.trials:
    if finished_trial.state != optuna.trial.TrialState.COMPLETE:
        continue
    best_results.setdefault(finished_trial.params['classifier_name'], []).append({
        'score': finished_trial.value,
        'trial_number': finished_trial.number,
        'params': dict(finished_trial.params),
    })

# order classifiers by name and, for each classifier, sort results according to score (descending)
best_results = {
    name: sorted(results, key=lambda d: d['score'], reverse=True)
    for name, results in sorted(best_results.items())
}

# print the best results found for each classifier
for name, results in best_results.items():
    print(name, json.dumps(results[0], indent=4, default=str))

# persist results to filesystem
with open('IoT23 - AutoML - Optuna - Sklearn - Mixed Parallel.json', 'w') as fp:
    # default=str mirrors the printing above so a non-JSON-native value
    # cannot abort the dump half-way through the file.
    json.dump(best_results, fp, default=str)

In [None]:
# Bar chart of the best score reached by each classifier.
fig, ax = plt.subplots(figsize=(16, 9))

plotted_names = []
for name, results in best_results.items():
    if not results:
        continue  # nothing was scored for this classifier, so there is no bar to draw
    # Take the maximum explicitly rather than relying on the list being pre-sorted.
    top_score = max(entry['score'] for entry in results)
    position = len(plotted_names)  # x position of this bar on the categorical axis
    plotted_names.append(name)
    ax.bar(name, top_score)
    ax.text(position, top_score + 0.01, f'{100*top_score:.1f}%', ha='center')

# Set tick positions, labels and label rotation together so the rotation
# always applies to the final labels.
ax.set_xticks(range(len(plotted_names)))
ax.set_xticklabels(plotted_names, rotation=45, ha='right')
ax.set_yticks(np.linspace(0, 1, 11))
ax.set_ylim(0, 1.05)
ax.set_xlabel('Classifier')
ax.set_ylabel('Best accuracy')
ax.set_title('Best accuracy per classifier')
plt.show()

In [None]:
# Wall-clock marker for the end of the notebook run.
print(f"Execution finished at {datetime.now():%Y-%m-%d %H:%M:%S}.")