In [1]:
import os
import warnings

warnings.filterwarnings('ignore')

import autosklearn.classification
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from datetime import datetime
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree

pd.options.display.max_columns = 999
RANDOM_STATE = 42


MODEL_DIR = os.path.join(
    'results',
    f"automl-decision-tree-{datetime.strftime(datetime.now(), '%Y-%M-%d-%H:%M:%S')}"
)

os.makedirs(MODEL_DIR)

MODEL_PATH = os.path.join(MODEL_DIR, 'model.joblib')
PLOT_PATH = os.path.join(MODEL_DIR, 'tree.pdf')
PLOT_FI_PATH = os.path.join(MODEL_DIR, 'fi.pdf')
DATA_PATH = 'data/final_train.csv'

In [3]:
df = pd.read_csv(DATA_PATH, index_col=0)

x, y = df.drop(columns = ['Activity', 'subject', 'void()']), df['Activity']
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=RANDOM_STATE)

automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task = 60*60*8, 
    include_estimators = ['decision_tree'],
    ensemble_nbest=1,
    n_jobs=-1
)
automl.fit(x_train, y_train)

joblib.dump(automl, MODEL_PATH)

y_train_hat = automl.predict(x_train)
y_test_hat = automl.predict(x_test)

TypeError: __init__() got an unexpected keyword argument 'include_estimators'

In [None]:
print('Train results')
print(classification_report(y_train, y_train_hat))
print(confusion_matrix(y_train, y_train_hat))
print('-'*40)

In [None]:
print('Test results')
print(classification_report(y_test, y_test_hat))
print(confusion_matrix(y_test, y_test_hat))
print('-'*40)

In [None]:
pipeline = automl.get_models_with_weights()[0][1]
model = pipeline.named_steps['classifier'].choice.estimator

In [None]:
fig, ax = plt.subplots(figsize=(60, 30))

plot_tree(
    model, 
    fontsize=10, 
    feature_names=x_train.columns, 
    class_names=automl.classes_,
    impurity=False,
    proportion=True
)

plt.savefig(PLOT_PATH)

In [None]:
selector = pipeline.named_steps['feature_preprocessor'].choice.get_preprocessor()

In [None]:
df = pd.DataFrame({
    'column': x.columns[selector.get_support()], 
    'feature-importance': model.feature_importances_
})
df = df.replace({0.0: np.nan}).dropna()

In [None]:
df.sort_values('feature-importance', ascending=False).plot(kind = 'bar', y = 'feature-importance', x = 'column', rot = 90, figsize = (16, 10))
plt.savefig(PLOT_FI_PATH)