In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score
import autosklearn.classification
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import pickle


# Shared label encoder: turns the raw class labels into integer ids and back again.
lab = preprocessing.LabelEncoder()

# Directory the behaviour CSV files are read from.
file_path = 'data/'

# Directory and file name the fitted classifier is pickled to / loaded from.
save_path = 'models/classification/'
model_name = 'automl__sklearn_classifier.sav'

In [None]:
# Histogram columns, in the order their values are interleaved into each feature vector.
_HISTOGRAM_COLUMNS = (
    'n_v', 'bins_v',
    'n_xv', 'bins_xv',
    'n_yv', 'bins_yv',
    'n_zv', 'bins_zv',
    'n_a_v', 'bins_a_v',
)


def _parse_float_list(text):
    """Parse a stringified array such as '[0. 1.5 3.]' into a list of floats."""
    return [float(token) for token in text.strip('][ ').split()]


def get_x_y_data(df, encoder=None, fit=True):
    """Build the flattened feature matrix and integer-encoded labels from ``df``.

    Every column in ``_HISTOGRAM_COLUMNS`` holds a whitespace-separated list of
    numbers wrapped in square brackets. For each row the ten lists are zipped
    position by position and the result is flattened into one feature vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ten histogram columns and the label column ``'h2s'``.
    encoder : object, optional
        Label encoder exposing ``fit_transform`` / ``transform``. Defaults to
        the notebook-level ``lab`` encoder.
    fit : bool, default True
        If True the encoder is (re)fitted on the labels of ``df``. Pass False
        for held-out data so that its labels are mapped with the encoding
        learned from the training data instead of a fresh one.

    Returns
    -------
    X : numpy.ndarray of shape (n_samples, n_positions * 10)
    y_transformed : array of encoded labels, one per row of ``df``

    Raises
    ------
    ValueError
        If ``df`` has no rows, or the rows do not all yield the same non-zero
        number of histogram positions.
    """
    if encoder is None:
        encoder = lab

    X_data = []
    for cells in zip(*(df[column] for column in _HISTOGRAM_COLUMNS)):
        parsed = [_parse_float_list(cell) for cell in cells]
        # zip() stops at the shortest list, so surplus trailing entries
        # (e.g. a final bin edge) are dropped.
        X_data.append(list(zip(*parsed)))

    X = np.array(X_data)
    if X.ndim != 3:
        raise ValueError(
            'expected a non-empty frame whose rows all have the same number of '
            'histogram entries, got an array of shape %s' % (X.shape,)
        )
    nsamples, nx, ny = X.shape
    X = X.reshape(nsamples, nx * ny)

    y = np.array(df['h2s'].tolist())
    y_transformed = encoder.fit_transform(y) if fit else encoder.transform(y)

    return X, y_transformed

In [None]:
train_df = pd.read_csv(file_path + 'behaviour_data.csv')
test_df = pd.read_csv(file_path + 'test_behaviour_data.csv')

X_train, y_train = get_x_y_data(train_df)
# By default get_x_y_data() refits the shared encoder `lab` on the frame it is given, so
# the labels it returns for the test frame may use a different integer mapping than the
# training labels. Only the test features are kept from this call.
X_test, _ = get_x_y_data(test_df)

# Fit the encoder on the training labels only and reuse that mapping for the test labels.
# transform() raises a ValueError if the test set contains a label never seen in training.
y_train = lab.fit_transform(train_df['h2s'].to_numpy())
y_test = lab.transform(test_df['h2s'].to_numpy())

# Train a classifier with auto-sklearn (AutoML)

In [None]:
# Build the AutoML classifier with its default settings and run the model search on the
# training data. To cap the search time, pass e.g. time_left_for_this_task=60*5.
cls = autosklearn.classification.AutoSklearnClassifier()
cls.fit(X_train, y_train)

In [None]:
# Report the outcome of the search: the run statistics first, then the leaderboard.
for report in (cls.sprint_statistics, cls.leaderboard):
    print(report())

In [None]:
# Predict the test set with the fitted classifier and report the fraction of correct labels.
predictions = cls.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=predictions)
print('Accuracy: %.3f' % (acc,))


# get the best model and its weight
# get_models_with_weights() yields (weight, model) pairs, so the weight is unpacked first.
# NOTE(review): entry 0 is presumably the highest-weighted ensemble member - confirm.
models = cls.get_models_with_weights()
best_weight, best_model = models[0]

# get information about the best model
print(best_model)
print(best_weight)

In [None]:
# Show the predicted and the true encoded labels one after the other for a quick comparison.
for tag, values in (('Pred: ', predictions), ('Test: ', y_test)):
    print(tag, values)

In [None]:
# get the best model and its weight
# get_models_with_weights() yields (weight, model) pairs, so the weight is unpacked first.
# NOTE(review): entry 0 is presumably the highest-weighted ensemble member - confirm.
models = cls.get_models_with_weights()
best_weight, best_model = models[0]

# get information about the best model
print(best_model)
print(best_weight)


## Confusion matrix


In [None]:
# Apply the seaborn settings before the figure exists so that the first run and any
# re-run of this cell are rendered with the same settings.
sns.set(font_scale=1.5)

# The matrix axes must cover every class id that occurs in either the true or the
# predicted labels; deriving the tick labels from y_test alone can leave them out of
# step with the matrix when a class is predicted that never occurs in y_test.
class_ids = sorted(set(y_test) | set(predictions))
labels = lab.inverse_transform(class_ids)
print(labels)

# normalize='all' expresses each cell as a fraction of all test samples.
confusion = confusion_matrix(y_test, predictions, labels=class_ids, normalize='all')

fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(confusion, annot=True, fmt='.1%', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted labels', fontsize=18)
ax.set_ylabel('True labels', fontsize=18)
ax.set_title('Confusion matrix for Random forest with Auto-Sklearn', fontweight='bold', fontsize=22)
ax.xaxis.set_ticklabels(labels, fontsize=12)
ax.yaxis.set_ticklabels(labels, fontsize=12)
plt.show()

## Saving the fitted AutoML model

In [None]:
# Pickle the fitted classifier object; the `with` block closes (and flushes) the file
# even if pickling fails part-way.
with open(save_path + model_name, 'wb') as model_file:
    pickle.dump(cls, model_file)

# Load model



In [None]:
# NOTE: unpickling can execute arbitrary code - only load model files you created yourself.
# The `with` block makes sure the file handle is closed after loading.
with open(save_path + model_name, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Re-score the restored model on the test set as a check that the round trip worked.
result = loaded_model.score(X_test, y_test)
print('Accuracy', result)