In [1]:
import os
import warnings

# Silence every warning category for the rest of the session. The call sits
# before the third-party imports below, presumably so that warnings emitted
# while importing them are hidden as well.
warnings.filterwarnings('ignore')

import autosklearn.classification
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from datetime import datetime
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Notebook-wide settings: the seed handed to the seeded steps below, and a
# generous column limit so wide frames are displayed without truncation.
RANDOM_STATE = 42
pd.options.display.max_columns = 999


# Every run stores its artifacts in its own timestamped folder under results/.
MODEL_DIR = os.path.join(
    'results',
    'automl-feature-reduction-' + datetime.now().strftime('%Y-%m-%d-%H:%M:%S'),
)

# No exist_ok here: creating a folder that already exists raises FileExistsError.
os.makedirs(MODEL_DIR)

# Input data set, plus the two artifacts written into the run folder.
DATA_PATH = 'data/final_train.csv'
MODEL_PATH = os.path.join(MODEL_DIR, 'model.joblib')
DATA_FI_PATH = os.path.join(MODEL_DIR, 'data_fi.csv')

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer

In [3]:
# Load the training table (the first CSV column becomes the index) and separate
# the 'Activity' target from the remaining feature columns.
df = pd.read_csv(DATA_PATH, index_col=0)

y = df['Activity']
x = df.drop(columns=['Activity'])

# Seeded hold-out split using train_test_split's default proportions.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=RANDOM_STATE,
)

In [4]:
# Fill missing values with per-column medians. The medians are learned from the
# training split only and then re-applied to the test split.
inputer = SimpleImputer(strategy='median')

x_train_t = inputer.fit_transform(x_train)
x_test_t = inputer.transform(x_test)

# L1-penalised logistic regression; the coefficients it leaves at exactly zero
# are what the later cells use to decide which features to drop.
# liblinear shuffles the data internally, so the seed is pinned to keep the
# fitted coefficients (and therefore the selected features) repeatable.
model = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    C=0.01,
    random_state=RANDOM_STATE,
)
model.fit(x_train_t, y_train)

LogisticRegression(C=0.01, penalty='l1', solver='liblinear')

In [5]:
# Predictions of the logistic-regression model on the imputed train and test
# splits, in that order.
y_train_hat, y_test_hat = (
    model.predict(features) for features in (x_train_t, x_test_t)
)

In [6]:
# Per-class metrics and confusion matrix on the training split, one item per
# line, closed by a 40-dash rule.
print(
    'Train results',
    classification_report(y_train, y_train_hat),
    confusion_matrix(y_train, y_train_hat),
    '-' * 40,
    sep='\n',
)

Train results
                    precision    recall  f1-score   support

            LAYING       0.99      1.00      0.99      1650
           SITTING       0.90      0.84      0.87      1479
          STANDING       0.86      0.91      0.89      1500
           WALKING       0.92      0.96      0.94      1417
WALKING_DOWNSTAIRS       0.94      0.93      0.94      1047
  WALKING_UPSTAIRS       0.96      0.92      0.94      1221

          accuracy                           0.93      8314
         macro avg       0.93      0.93      0.93      8314
      weighted avg       0.93      0.93      0.93      8314

[[1649    1    0    0    0    0]
 [  20 1238  214    0    2    5]
 [   0  130 1370    0    0    0]
 [   0    0    0 1367   18   32]
 [   0    0    0   67  974    6]
 [   0    0    0   57   41 1123]]
----------------------------------------


In [7]:
# Per-class metrics and confusion matrix on the held-out split, one item per
# line, closed by a 40-dash rule.
print(
    'Test results',
    classification_report(y_test, y_test_hat),
    confusion_matrix(y_test, y_test_hat),
    '-' * 40,
    sep='\n',
)

Test results
                    precision    recall  f1-score   support

            LAYING       0.99      0.99      0.99       517
           SITTING       0.92      0.83      0.87       540
          STANDING       0.84      0.91      0.88       479
           WALKING       0.92      0.95      0.94       483
WALKING_DOWNSTAIRS       0.94      0.93      0.93       341
  WALKING_UPSTAIRS       0.94      0.93      0.94       412

          accuracy                           0.92      2772
         macro avg       0.93      0.92      0.92      2772
      weighted avg       0.92      0.92      0.92      2772

[[514   2   0   0   1   0]
 [  5 448  84   0   0   3]
 [  0  39 438   1   0   1]
 [  0   0   0 459   7  17]
 [  0   0   0  22 316   3]
 [  0   0   0  16  11 385]]
----------------------------------------


In [8]:
# Coefficient table of the fitted model: one row per row of model.coef_, one
# column per input feature. Exact zeros become NaN, and features whose
# coefficient is zero in every row are dropped, leaving only the features the
# L1 penalty kept.
#
# Column labels come from x_train (the frame the model was fitted on) rather
# than from `x`: `x` is rebound to the reduced data set further down, so
# re-running this cell afterwards would otherwise fail with a shape mismatch.
fi = (
    pd.DataFrame(model.coef_, columns=x_train.columns)
    .replace({0.0: np.nan})
    .dropna(how='all', axis='columns')
)

INFO:numexpr.utils:NumExpr defaulting to 8 threads.


In [9]:
# Display the coefficient table built in the previous cell: NaN marks a
# coefficient that was exactly 0.0, and columns that were zero in every row
# have already been dropped.
fi

Unnamed: 0,"angle(tBodyGyroMean,gravityMean)",energy-mean(),"fBodyAcc-bandsEnergy()-1,16","fBodyAcc-bandsEnergy()-1,24.1",fBodyAcc-entropy()-X,fBodyAcc-entropy()-Y,fBodyAcc-kurtosis()-X,fBodyAcc-max()-Y,fBodyAccJerk-entropy()-Y,fBodyAccJerk-meanFreq()-Z,fBodyAccJerk-std()-X,fBodyAccJerk-std()-Y,fBodyAccMag-mad(),fBodyAccMag-skewness(),fBodyBodyAccJerkMag-entropy(),fBodyBodyAccJerkMag-skewness(),fBodyBodyGyroJerkMag-kurtosis(),fBodyBodyGyroJerkMag-skewness(),fBodyBodyGyroMag-maxInds,"fBodyGyro-bandsEnergy()-1,8.2",fBodyGyro-entropy()-X,fBodyGyro-kurtosis()-Y,fBodyGyro-maxInds-Z,fBodyGyro-meanFreq()-X,fBodyGyro-meanFreq()-Y,fBodyGyro-skewness()-X,fBodyGyro-skewness()-Z,shadow-gravity-angle(),subject,"tBodyAcc-arCoeff()-X,1",tBodyAcc-sma(),tBodyAccJerk-entropy()-X,tBodyAccJerk-iqr()-Y,tBodyAccJerk-min()-Y,tBodyAccJerkMag-entropy(),tBodyAccJerkMag-mad(),tBodyAccMag-std(),"tBodyGyro-arCoeff()-X,4",tBodyGyro-low()-X,tBodyGyro-low()-Y,tBodyGyro-low()-Z,tBodyGyro-max()-X,tBodyGyro-std()-Y,"tBodyGyroJerk-arCoeff()-Z,3",tBodyGyroMag-entropy(),"tGravityAcc-arCoeff()-Y,4","tGravityAcc-arCoeff()-Z,3","tGravityAcc-correlation()-X,Y","tGravityAcc-correlation()-X,Z","tGravityAcc-correlation()-Y,Z",tGravityAcc-entropy()-Y,tGravityAcc-max()-X,tGravityAcc-mean()-X,tGravityAcc-mean()-Y,tGravityAcc-mean()-Z,tGravityAccMag-arCoeff()1,tGravityAccMag-mad(),tGravityAccMag-std()
0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.028136,,,,,,,,,,,,,,,,,,,,,,,1.424813,1.787104,,-0.302641,,,
1,-0.047041,,,,,,,,,0.236892,,,,,0.075998,,,-0.022917,,,1.218207,,,,,0.324975,-0.241673,,-0.018777,,,,0.061683,,,,,,,-0.000307,,,,,,,0.413381,-0.360816,0.11264,,-0.081141,-0.814298,-1.249621,-1.182444,-0.169856,,,
2,,0.002191,,,0.289963,,,,-0.001063,-0.053526,0.01049,,,-0.062474,,,,,,,,0.067134,,-0.690046,,-0.082276,0.200629,3.5e-05,-0.003646,-0.348295,0.119308,,,,0.319342,0.797445,,-0.060341,,,-0.001811,,,,-0.315297,,,0.351159,-0.089798,-4.4e-05,,,-0.872178,1.759474,0.112155,,0.000258,
3,,,1.322076,,,-0.141504,,,,,,0.88433,,,,-0.134742,-0.000143,-0.018383,-0.417714,,-0.217574,,-0.071145,,,0.00372,,0.038473,0.027511,,,-0.456884,-0.522378,0.125578,-0.553428,,,,,,,,-0.171818,-0.066372,-0.632589,0.34186,,,-0.030293,0.08237,,,,,,-0.450431,,0.72603
4,-0.138632,,-0.722678,0.127981,,,-0.052437,,,,,,-0.190777,,,-0.012419,,,,,,0.052974,,,,-0.084561,0.653671,0.069691,0.024368,,,,,,,,-0.227833,,,,-0.039018,,0.091063,,,,,,,,,,,,,0.124115,,-1.410105
5,0.129598,0.013241,,,-0.062185,,-0.008429,-0.215691,,0.88297,,,,,,0.274506,,,,-0.127632,,-0.330328,,,-0.004167,-0.117194,-0.479498,-0.155307,-0.02221,0.068475,-0.039471,-0.796477,,,,,,,0.0147,,0.083904,-0.06257,-0.067756,,,-0.910108,,,0.005413,,,,,,0.43409,,,


In [10]:
# (rows, columns) of the filtered coefficient table.
fi.shape

(6, 58)

In [11]:
# Keep only the features that survived the L1 filter.
x_selected = x[fi.columns]

# Persist the reduced data set with the target as the first column, followed by
# the kept features; the index is not written.
pd.concat(
    [y.to_frame(), x_selected],
    axis=1,
).to_csv(DATA_FI_PATH, index=False)

In [12]:
# Read the reduced data set back from disk and split it with the same seed as
# the first split.
# NOTE: this rebinds the notebook-level names `x` and `y` to the reduced data.
df_fi = pd.read_csv(DATA_FI_PATH)

y = df_fi['Activity']
x = df_fi.drop(columns=['Activity'])
x_train_s, x_test_s, y_train_s, y_test_s = train_test_split(
    x,
    y,
    random_state=RANDOM_STATE,
)

In [13]:
# Search for a classifier on the reduced feature set: the overall time budget
# is set to 120 and the final ensemble is limited to a single model.
automl = autosklearn.classification.AutoSklearnClassifier(
    ensemble_size=1,
    time_left_for_this_task=120,
)
automl.fit(x_train_s, y_train_s)

# Store the fitted estimator in this run's folder.
joblib.dump(automl, MODEL_PATH)

# NOTE: these overwrite the predictions of the earlier logistic-regression model.
y_train_hat, y_test_hat = (
    automl.predict(features) for features in (x_train_s, x_test_s)
)

In [14]:
# Per-class metrics and confusion matrix on the reduced training split, one
# item per line, closed by a 40-dash rule.
print(
    'Train results',
    classification_report(y_train_s, y_train_hat),
    confusion_matrix(y_train_s, y_train_hat),
    '-' * 40,
    sep='\n',
)

Train results
                    precision    recall  f1-score   support

            LAYING       1.00      1.00      1.00      1650
           SITTING       0.98      0.99      0.99      1479
          STANDING       0.99      0.98      0.99      1500
           WALKING       0.99      1.00      1.00      1417
WALKING_DOWNSTAIRS       0.99      0.99      0.99      1047
  WALKING_UPSTAIRS       1.00      1.00      1.00      1221

          accuracy                           0.99      8314
         macro avg       0.99      0.99      0.99      8314
      weighted avg       0.99      0.99      0.99      8314

[[1650    0    0    0    0    0]
 [   0 1463   15    0    0    1]
 [   0   23 1477    0    0    0]
 [   0    0    0 1413    4    0]
 [   0    0    0    6 1037    4]
 [   0    0    0    2    4 1215]]
----------------------------------------


In [15]:
# Per-class metrics and confusion matrix on the reduced held-out split, one
# item per line, closed by a 40-dash rule.
print(
    'Test results',
    classification_report(y_test_s, y_test_hat),
    confusion_matrix(y_test_s, y_test_hat),
    '-' * 40,
    sep='\n',
)

Test results
                    precision    recall  f1-score   support

            LAYING       1.00      1.00      1.00       517
           SITTING       0.94      0.95      0.95       540
          STANDING       0.94      0.94      0.94       479
           WALKING       0.99      0.99      0.99       483
WALKING_DOWNSTAIRS       0.97      0.99      0.98       341
  WALKING_UPSTAIRS       1.00      0.97      0.98       412

          accuracy                           0.97      2772
         macro avg       0.97      0.97      0.97      2772
      weighted avg       0.97      0.97      0.97      2772

[[517   0   0   0   0   0]
 [  0 512  28   0   0   0]
 [  0  30 449   0   0   0]
 [  0   0   0 480   3   0]
 [  0   0   0   1 338   2]
 [  0   0   0   5   6 401]]
----------------------------------------


In [16]:
# List the fitted pipeline(s) that make up the final ensemble, each paired with
# its ensemble weight.
automl.get_models_with_weights()

[(1.0,
  SimpleClassificationPipeline({'balancing:strategy': 'none', 'classifier:__choice__': 'random_forest', 'data_preprocessing:categorical_transformer:categorical_encoding:__choice__': 'one_hot_encoding', 'data_preprocessing:categorical_transformer:category_coalescence:__choice__': 'minority_coalescer', 'data_preprocessing:numerical_transformer:imputation:strategy': 'mean', 'data_preprocessing:numerical_transformer:rescaling:__choice__': 'standardize', 'feature_preprocessor:__choice__': 'no_preprocessing', 'classifier:random_forest:bootstrap': 'True', 'classifier:random_forest:criterion': 'gini', 'classifier:random_forest:max_depth': 'None', 'classifier:random_forest:max_features': 0.5, 'classifier:random_forest:max_leaf_nodes': 'None', 'classifier:random_forest:min_impurity_decrease': 0.0, 'classifier:random_forest:min_samples_leaf': 1, 'classifier:random_forest:min_samples_split': 2, 'classifier:random_forest:min_weight_fraction_leaf': 0.0, 'data_preprocessing:categorical_transfo