# tsfresh's feature extraction

Notebook with feature extraction provided by _tsfresh library_ for use it to train prediction models.
Cuaderno con la extracción de características que proporciona la librería tsfresh para su uso en el entrenamiento de modelos de predicción.


100 time-series dataframe with energy consumption every 5 minutes of 100 different commercial locations will be used to clasify the sub-industry to which each trade belongs.

Following tasks will be done:
1. Extract all time-series features (754)
2. Training different models with the feature matrix extracted to predict the subindustry of some sites.
3. Extract 28 specific time-series features.
4. Training different models with the feature matrix extracted to predict the subindustry of some sites.
5. Select relevant features from all features extracted for classification tasks.
6. Training different models with the feature matrix extracted to predict the subindustry of some sites.

In [1]:
from khiva.dimensionality import *
from khiva.library import *
from khiva.array import *
import pandas as pd
import time
import matplotlib.pyplot as plt
from ipywidgets import IntSlider, SelectionSlider, interact
import warnings 
warnings.filterwarnings('ignore')

%config IPCompleter.greedy=True
%matplotlib inline
plt.rcParams['figure.figsize'] = [16, 5]

 We load the dataset and convert it into one option that _tsfresh library_ offers to specify the time series data to be used in the feature extraction function.
https://tsfresh.readthedocs.io/en/latest/text/data_formats.html 

In [2]:
all_sites = pd.read_csv("../data_exploration/energy/data/data-enerNoc/all-data/meta/all_sites.csv")
sites_id = all_sites['SITE_ID'].values
sites_id

array([  6,   8,   9,  10,  12,  13,  14,  21,  22,  25,  29,  30,  31,
        32,  36,  41,  42,  44,  45,  49,  51,  55,  56,  65,  78,  88,
        92,  99, 100, 101, 103, 109, 111, 116, 136, 137, 144, 153, 186,
       197, 213, 214, 217, 218, 224, 228, 236, 259, 270, 275, 281, 285,
       304, 339, 341, 363, 366, 384, 386, 391, 399, 400, 401, 404, 427,
       454, 455, 472, 474, 475, 478, 484, 492, 496, 512, 648, 654, 673,
       674, 690, 697, 703, 716, 718, 731, 737, 742, 744, 745, 755, 761,
       765, 766, 767, 771, 786, 805, 808, 832, 887], dtype=int64)

In [3]:
dataframe = pd.DataFrame()
cont = 0
for i in sites_id:
    df = pd.read_csv('../data_exploration/energy/data/data-enerNoc/all-data/csv/%d.csv' % i)
    data = df[:3000]
    data.reset_index(inplace=True)
    data = data.rename(columns={"index": "time"})
    data = data.drop(list(('timestamp', 'dttm_utc', 'anomaly', 'estimated')), axis=1)
    data['index']=np.full(3000, cont)
    frames=[dataframe, data]
    dataframe= pd.concat(frames)
    cont+=1


In [4]:
dataframe.tail()

Unnamed: 0,time,value,index
2995,2995,50.9889,99
2996,2996,51.5618,99
2997,2997,52.1347,99
2998,2998,53.2805,99
2999,2999,55.5722,99


# 1. Classification on all the features extracted

We use _tsfresh_ to extract all features

In [5]:
from tsfresh import extract_features

start = time.time()

X = extract_features(dataframe, column_id='index', column_sort='time')

print("Time to extract the features : " + str(time.time() - start) + " seconds.")

Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 25/25 [11:52<00:00, 28.49s/it]


Time to extract the features : 713.1246609687805 seconds.


In [5]:
#X.to_csv('754_features.csv')
X = pd.read_csv("754_features.csv")


In [6]:
X.tail()

Unnamed: 0,id,value__abs_energy,value__absolute_sum_of_changes,"value__agg_autocorrelation__f_agg_""mean""__maxlag_40","value__agg_autocorrelation__f_agg_""median""__maxlag_40","value__agg_autocorrelation__f_agg_""var""__maxlag_40","value__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""intercept""","value__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""rvalue""","value__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""slope""","value__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""stderr""",...,value__symmetry_looking__r_0.9,value__symmetry_looking__r_0.9500000000000001,value__time_reversal_asymmetry_statistic__lag_1,value__time_reversal_asymmetry_statistic__lag_2,value__time_reversal_asymmetry_statistic__lag_3,value__value_count__value_-1,value__value_count__value_0,value__value_count__value_1,value__variance,value__variance_larger_than_standard_deviation
95,95,169220000.0,19505.5733,0.49302,0.444073,0.017527,259.695018,-0.436424,-0.090913,0.010857,...,1.0,1.0,-2919.373163,-5671.586137,-8035.872946,0.0,0.0,0.0,488.712955,1.0
96,96,299086.1,1546.8603,0.902067,0.89588,0.001285,6.589746,0.337939,0.021413,0.003455,...,1.0,1.0,0.881082,1.411394,1.805925,0.0,0.0,0.0,26.3078,1.0
97,97,208729.7,2341.2617,0.843913,0.856522,0.006768,8.677254,-0.013005,-0.000695,0.003095,...,1.0,1.0,-0.095115,-0.006596,0.210329,0.0,0.0,0.0,16.561033,1.0
98,98,27075650.0,12878.5865,0.928992,0.929781,0.001578,47.180613,0.344286,0.237675,0.037546,...,1.0,1.0,237.329758,461.401345,699.208698,0.0,0.0,0.0,3289.337288,1.0
99,99,32456330.0,5499.3625,0.858164,0.868876,0.009696,83.421594,0.154652,0.086579,0.03204,...,1.0,1.0,18.410172,-41.891282,-123.219068,0.0,0.0,0.0,2256.806057,1.0


In [7]:
X['value__friedrich_coefficients__m_3__r_30__coeff_0'].fillna((X['value__friedrich_coefficients__m_3__r_30__coeff_0'].mean()), inplace=True)
X['value__friedrich_coefficients__m_3__r_30__coeff_1'].fillna((X['value__friedrich_coefficients__m_3__r_30__coeff_1'].mean()), inplace=True)
X['value__friedrich_coefficients__m_3__r_30__coeff_2'].fillna((X['value__friedrich_coefficients__m_3__r_30__coeff_2'].mean()), inplace=True)
X['value__friedrich_coefficients__m_3__r_30__coeff_3'].fillna((X['value__friedrich_coefficients__m_3__r_30__coeff_3'].mean()), inplace=True)
X['value__max_langevin_fixed_point__m_3__r_30'].fillna((X['value__max_langevin_fixed_point__m_3__r_30'].mean()), inplace=True)

In [8]:
from sklearn.preprocessing import scale
X = X.drop('id', axis=1)
X=scale(X)

## Split the dataset into train and test subsets

In [9]:
from sklearn.preprocessing import scale
y = all_sites["SUB_INDUSTRY"].values

In [10]:
import sklearn
file_names = []
# This is needed to check the accuracy of the predictive modelling step.
for name in all_sites["SITE_ID"].values:
    file_names.append(name)
    
for i in range(15):
    X, y, file_names = sklearn.utils.shuffle(X, y, file_names, random_state=0)

In [11]:
files_test = []
list_test_indices = []
test_files = [92, 45, 761, 10, 766, 400, 673, 49, 144, 496, 731, 281, 213, 197, 399]

for i in range(len(file_names)):
    # choosing always the same examples for testing to ensure the same results ....
    if file_names[i] in test_files:
        list_test_indices.append(i)
        files_test.append(file_names[i])
        
        
print(list_test_indices)

[0, 10, 18, 22, 30, 33, 39, 44, 45, 53, 70, 73, 80, 83, 90]


In [12]:
X_train = np.delete(X, list_test_indices, 0)
X_test = np.take(X, list_test_indices, 0)
y_train = np.delete(y, list_test_indices)
y_test = np.take(y, list_test_indices)

X_train.shape

(85, 754)

We apply classification algorithms to predict some sites.

> ### 1.1. SVC

In [14]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

cv_grid_svc = GridSearchCV(SVC(), param_grid = {
    'degree':[3,4],
    'shrinking':[True,False],
    'probability':[True,False]
}, cv=10)

cv_grid_svc.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'degree': [3, 4], 'shrinking': [True, False], 'probability': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [15]:
from sklearn import metrics

bestclassifier_svc=cv_grid_svc.best_estimator_

y_predict_svc = bestclassifier_svc.predict(X_test)
accuracy = metrics.accuracy_score(y_test,y_predict_svc)
print(cv_grid_svc.best_params_)
print('Accuracy of the best classifier after CV is %.3f%%' % (accuracy*100))

{'degree': 3, 'probability': True, 'shrinking': True}
Accuracy of the best classifier after CV is 80.000%


>### 1.2. Random Forest

In [16]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

cv_grid_rf = GridSearchCV(estimator = RandomForestClassifier(), param_grid = {
    'n_estimators': range(10, 100, 10),
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}, cv=10)

cv_grid_rf.fit(X_train, y_train)

  from numpy.core.umath_tests import inner1d


GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': range(10, 100, 10), 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [4, 5, 6, 7, 8], 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [17]:
from sklearn import metrics

bestclassifier_rf=cv_grid_rf.best_estimator_

y_predict_rf = bestclassifier_rf.predict(X_test)
accuracy = metrics.accuracy_score(y_test,y_predict_rf)
print(cv_grid_rf.best_params_)
print('Accuracy of the best classifier after CV is %.3f%%' % (accuracy*100))

{'criterion': 'gini', 'max_depth': 4, 'max_features': 'log2', 'n_estimators': 80}
Accuracy of the best classifier after CV is 66.667%


>### 1.3. Logistic Regression

In [18]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

cv_grid_lr = GridSearchCV(estimator = LogisticRegression(), param_grid = {
    'C': np.logspace(0, 4, 10),
    'penalty': ['l1', 'l2'],
}, cv=10)

cv_grid_lr.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': array([1.00000e+00, 2.78256e+00, 7.74264e+00, 2.15443e+01, 5.99484e+01,
       1.66810e+02, 4.64159e+02, 1.29155e+03, 3.59381e+03, 1.00000e+04]), 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [19]:
from sklearn import metrics

bestclassifier_lr=cv_grid_lr.best_estimator_

y_predict_lr = bestclassifier_lr.predict(X_test)
accuracy = metrics.accuracy_score(y_test,y_predict_lr)
print(cv_grid_lr.best_params_)
print('Accuracy of the best classifier after CV is %.3f%%' % (accuracy*100))

{'C': 1.0, 'penalty': 'l2'}
Accuracy of the best classifier after CV is 53.333%


>### 1.4. KNN

In [20]:
from sklearn.model_selection import GridSearchCV

cv_grid_knn = GridSearchCV(estimator = KNeighborsClassifier(), param_grid = {
    'n_neighbors':[4,5,6,7],
    'leaf_size':[1,3,5],
    'algorithm':['auto', 'kd_tree']
}, cv=10)

cv_grid_knn.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_neighbors': [4, 5, 6, 7], 'leaf_size': [1, 3, 5], 'algorithm': ['auto', 'kd_tree']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [21]:
from sklearn import metrics

bestclassifier_knn=cv_grid_knn.best_estimator_

y_predict_knn = bestclassifier_knn.predict(X_test)
accuracy = metrics.accuracy_score(y_test,y_predict_knn)
print(cv_grid_knn.best_params_)
print('Accuracy of the best classifier after CV is %.3f%%' % (accuracy*100))

{'algorithm': 'auto', 'leaf_size': 1, 'n_neighbors': 4}
Accuracy of the best classifier after CV is 60.000%


>### 1.5. XGBOOST

In [13]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
import warnings 
warnings.filterwarnings('ignore')

cv_grid_xb = GridSearchCV(estimator = XGBClassifier(), param_grid = {
    'max_depth': range (2, 10, 1),
    'n_estimators': range(10, 100, 10),
    'learning_rate': [0.1, 0.01, 0.05]
}, cv=10)

cv_grid_xb.fit(X_train, y_train)


GridSearchCV(cv=10, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': range(2, 10), 'n_estimators': range(10, 100, 10), 'learning_rate': [0.1, 0.01, 0.05]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [14]:
from sklearn import metrics

bestclassifier_xb=cv_grid_xb.best_estimator_

y_predict_xb = bestclassifier_xb.predict(X_test)
accuracy = metrics.accuracy_score(y_test,y_predict_xb)
print(cv_grid_xb.best_params_)
print('Accuracy of the best classifier after CV is %.3f%%' % (accuracy*100))

{'learning_rate': 0.01, 'max_depth': 2, 'n_estimators': 70}
Accuracy of the best classifier after CV is 66.667%


# 2.  Classification on the 28 features extracted

We select specific features that will be extracted

In [15]:
fc_parameters = {
    "abs_energy": None,
    'abs_energy': None,
    'absolute_sum_of_changes': None,
    'count_above_mean':None,
    'count_below_mean':None,
    'first_location_of_maximum':None,
    'first_location_of_minimum':None,
    'has_duplicate':None,
    'has_duplicate_max':None,
    'kurtosis':None,
    'last_location_of_maximum':None,
    'last_location_of_minimum':None,
    'has_duplicate_min':None,
    'longest_strike_above_mean':None,
    'longest_strike_below_mean':None,
    'maximum':None,
    'mean_abs_change':None,
    'minimum':None,
    'number_crossing_m':[{"m": 0}],
    'mean':None,
    'median':None,
    'mean_change':None,
    'ratio_value_number_to_time_series_length':None,
    'skewness':None,
    'standard_deviation':None,
    'sum_of_reoccurring_values':None,
    'sum_values':None,
    'variance':None,
    'variance_larger_than_standard_deviation':None
}

In [16]:
from tsfresh import extract_features
start = time.time()

X_tsfresh = extract_features(dataframe, column_id='index', column_sort='time', default_fc_parameters=fc_parameters)

print("Time to extract the features : " + str(time.time() - start) + " seconds.")

Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 25/25 [00:05<00:00,  4.20it/s]


Time to extract the features : 6.751388072967529 seconds.


In [17]:
X_tsfresh.tail()

variable,value__abs_energy,value__absolute_sum_of_changes,value__count_above_mean,value__count_below_mean,value__first_location_of_maximum,value__first_location_of_minimum,value__has_duplicate,value__has_duplicate_max,value__has_duplicate_min,value__kurtosis,...,value__median,value__minimum,value__number_crossing_m__m_0,value__ratio_value_number_to_time_series_length,value__skewness,value__standard_deviation,value__sum_of_reoccurring_values,value__sum_values,value__variance,value__variance_larger_than_standard_deviation
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
95,169220000.0,19505.5733,2028.0,972.0,0.184,0.883333,1.0,1.0,1.0,2.589399,...,245.643,122.8215,0.0,0.006,-1.53874,22.106853,3630.9104,709409.2583,488.712955,1.0
96,299086.1,1546.8603,1716.0,1284.0,0.249667,0.057,1.0,1.0,1.0,-1.441169,...,9.9965,1.2982,0.0,0.045667,-0.086381,5.129113,1345.6332,25699.9653,26.3078,1.0
97,208729.7,2341.2617,920.0,2080.0,0.258,0.654333,1.0,0.0,1.0,0.647844,...,6.0434,2.1976,0.0,0.023333,1.222948,4.069525,754.5973,21843.532,16.561033,1.0
98,27075650.0,12878.5865,1479.0,1521.0,0.439667,0.001667,1.0,1.0,1.0,-1.724007,...,66.509,12.0925,0.0,0.009,0.240556,57.352744,2448.7399,227206.7849,3289.337288,1.0
99,32456330.0,5499.3625,1043.0,1957.0,0.341333,0.101,1.0,1.0,0.0,-1.290202,...,61.8742,39.5307,0.0,0.069,0.735249,47.505853,21101.9585,277592.7504,2256.806057,1.0


In [18]:
y = all_sites["SUB_INDUSTRY"].values
X = X_tsfresh

In [19]:
X = scale(X)

## Split the dataset into train and test subsets

In [20]:
import sklearn
file_names = []
# This is needed to check the accuracy of the predictive modelling step.
for name in all_sites["SITE_ID"].values:
    file_names.append(name)
    
for i in range(15):
    X, y, file_names = sklearn.utils.shuffle(X, y, file_names, random_state=0)

In [21]:
files_test = []
list_test_indices = []
test_files = [92, 45, 761, 10, 766, 400, 673, 49, 144, 496, 731, 281, 213, 197, 399]

for i in range(len(file_names)):
    # choosing always the same examples for testing to ensure the same results ....
    if file_names[i] in test_files:
        list_test_indices.append(i)
        files_test.append(file_names[i])
        
        
print(list_test_indices)

[0, 10, 18, 22, 30, 33, 39, 44, 45, 53, 70, 73, 80, 83, 90]


In [22]:
X_train = np.delete(X, list_test_indices, 0)
X_test = np.take(X, list_test_indices, 0)
y_train = np.delete(y, list_test_indices)
y_test = np.take(y, list_test_indices)

X_train.shape

(85, 28)

>### 2.1. SVC

In [24]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

cv_grid_svc = GridSearchCV(SVC(), param_grid = {
    'degree':[3,4],
    'shrinking':[True,False],
    'probability':[True,False]
})

cv_grid_svc.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'degree': [3, 4], 'shrinking': [True, False], 'probability': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [25]:
from sklearn import metrics

bestclassifier_svc=cv_grid_svc.best_estimator_

y_predict_svc = bestclassifier_svc.predict(X_test)
accuracy = metrics.accuracy_score(y_test,y_predict_svc)
print(cv_grid_svc.best_params_)
print('Accuracy of the best classifier after CV is %.3f%%' % (accuracy*100))

{'degree': 3, 'probability': True, 'shrinking': True}
Accuracy of the best classifier after CV is 80.000%


>### 2.2. Random Forest

In [27]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

cv_grid_rf = GridSearchCV(estimator = RandomForestClassifier(), param_grid = {
    'n_estimators': range(10, 100, 10),
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}, cv=10)

cv_grid_rf.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': range(10, 100, 10), 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [4, 5, 6, 7, 8], 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [28]:
from sklearn import metrics

bestclassifier_rf=cv_grid_rf.best_estimator_

y_predict_rf = bestclassifier_rf.predict(X_test)
accuracy = metrics.accuracy_score(y_test,y_predict_rf)
print(cv_grid_rf.best_params_)
print('Accuracy of the best classifier after CV is %.3f%%' % (accuracy*100))

{'criterion': 'gini', 'max_depth': 4, 'max_features': 'auto', 'n_estimators': 90}
Accuracy of the best classifier after CV is 66.667%


>### 2.3. Logistic Regression

In [29]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

cv_grid_lr = GridSearchCV(estimator = LogisticRegression(), param_grid = {
    'C': np.logspace(0, 4, 10),
    'penalty': ['l1', 'l2'],
}, cv=10)

cv_grid_lr.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': array([1.00000e+00, 2.78256e+00, 7.74264e+00, 2.15443e+01, 5.99484e+01,
       1.66810e+02, 4.64159e+02, 1.29155e+03, 3.59381e+03, 1.00000e+04]), 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [30]:
from sklearn import metrics

bestclassifier_lr=cv_grid_lr.best_estimator_

y_predict_lr = bestclassifier_lr.predict(X_test)
accuracy = metrics.accuracy_score(y_test,y_predict_lr)
print(cv_grid_lr.best_params_)
print('Accuracy of the best classifier after CV is %.3f%%' % (accuracy*100))

{'C': 2.7825594022071245, 'penalty': 'l1'}
Accuracy of the best classifier after CV is 66.667%


>### 2.4. KNN

In [32]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

cv_grid_knn = GridSearchCV(estimator = KNeighborsClassifier(), param_grid = {
    'n_neighbors':[4,5,6,7],
    'leaf_size':[1,3,5],
    'algorithm':['auto', 'kd_tree']
}, cv=10)

cv_grid_knn.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_neighbors': [4, 5, 6, 7], 'leaf_size': [1, 3, 5], 'algorithm': ['auto', 'kd_tree']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [33]:
from sklearn import metrics

bestclassifier_knn=cv_grid_knn.best_estimator_

y_predict_knn = bestclassifier_knn.predict(X_test)
accuracy = metrics.accuracy_score(y_test,y_predict_knn)
print(cv_grid_knn.best_params_)
print('Accuracy of the best classifier after CV is %.3f%%' % (accuracy*100))

{'algorithm': 'auto', 'leaf_size': 1, 'n_neighbors': 6}
Accuracy of the best classifier after CV is 60.000%


>### 2.5. XGBOOST

In [34]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

cv_grid_xb = GridSearchCV(estimator = XGBClassifier(), param_grid = {
    'max_depth': range (2, 10, 1),
    'n_estimators': range(10, 100, 10),
    'learning_rate': [0.1, 0.01, 0.05]
}, cv=10)

cv_grid_xb.fit(X_train, y_train)


GridSearchCV(cv=10, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': range(2, 10), 'n_estimators': range(10, 100, 10), 'learning_rate': [0.1, 0.01, 0.05]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [35]:
from sklearn import metrics

bestclassifier_xb=cv_grid_xb.best_estimator_

y_predict_xb = bestclassifier_xb.predict(X_test)
accuracy = metrics.accuracy_score(y_test,y_predict_xb)
print(cv_grid_xb.best_params_)
print('Accuracy of the best classifier after CV is %.3f%%' % (accuracy*100))

{'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 50}
Accuracy of the best classifier after CV is 80.000%


# 3. Classification on the relevant features extracted

In [5]:
X = pd.read_csv("754_features.csv")

In [6]:
X['value__friedrich_coefficients__m_3__r_30__coeff_0'].fillna((X['value__friedrich_coefficients__m_3__r_30__coeff_0'].mean()), inplace=True)
X['value__friedrich_coefficients__m_3__r_30__coeff_1'].fillna((X['value__friedrich_coefficients__m_3__r_30__coeff_1'].mean()), inplace=True)
X['value__friedrich_coefficients__m_3__r_30__coeff_2'].fillna((X['value__friedrich_coefficients__m_3__r_30__coeff_2'].mean()), inplace=True)
X['value__friedrich_coefficients__m_3__r_30__coeff_3'].fillna((X['value__friedrich_coefficients__m_3__r_30__coeff_3'].mean()), inplace=True)
X['value__max_langevin_fixed_point__m_3__r_30'].fillna((X['value__max_langevin_fixed_point__m_3__r_30'].mean()), inplace=True)

In [7]:
X_dataframe=pd.DataFrame(X)

We use _select_features_ function to select relevant features among all features extracted for classification task

In [10]:
from tsfresh.feature_selection.selection import select_features
X_filtered = select_features(X_dataframe, y)
X_filtered.head()



Unnamed: 0,id,"value__fft_coefficient__coeff_66__attr_""real""","value__fft_coefficient__coeff_58__attr_""imag""","value__fft_coefficient__coeff_20__attr_""abs""","value__fft_coefficient__coeff_66__attr_""abs""","value__fft_coefficient__coeff_35__attr_""imag""","value__fft_coefficient__coeff_62__attr_""imag""","value__fft_coefficient__coeff_67__attr_""abs""","value__fft_coefficient__coeff_98__attr_""abs""","value__fft_coefficient__coeff_10__attr_""abs""",...,value__index_mass_quantile__q_0.6,value__energy_ratio_by_chunks__num_segments_10__segment_focus_5,"value__fft_coefficient__coeff_52__attr_""imag""",value__symmetry_looking__r_0.1,value__count_below_mean,value__count_above_mean,"value__fft_coefficient__coeff_93__attr_""imag""","value__fft_coefficient__coeff_16__attr_""imag""",value__number_cwt_peaks__n_1,"value__fft_coefficient__coeff_16__attr_""angle"""
0,0,222.509129,227.392908,2061.138483,674.378729,81.219038,42.358167,998.488856,527.260712,10752.598658,...,0.598667,0.092192,-795.462125,1.0,1517.0,1483.0,-371.665563,-1028.189683,248.0,-91.230499
1,1,1396.576617,922.347886,8503.897383,1457.846808,1724.899392,2100.598948,1948.672509,1414.768941,18120.494049,...,0.566,0.117585,-762.41723,1.0,1520.0,1480.0,-1641.395094,8124.169478,241.0,72.279473
2,2,-34.819392,-88.152898,507.033012,108.324792,73.820233,49.822441,102.634403,169.103653,10394.141215,...,0.567333,0.120544,276.448999,0.0,1877.0,1123.0,62.417951,299.522734,248.0,114.303808
3,3,2689.561936,4080.327296,9807.569078,2852.117154,6089.347629,2261.647595,851.08435,1765.34915,34391.793717,...,0.620667,0.127519,4855.57301,1.0,1162.0,1838.0,1075.619074,7.273246,246.0,0.160733
4,4,-8.85811,12.304789,972.068243,82.968876,8.748254,-100.865715,135.251746,41.592616,13565.096256,...,0.579333,0.116335,-368.274782,0.0,1886.0,1114.0,-61.761128,-679.368062,251.0,-57.84582


In [11]:
from sklearn.preprocessing import scale
X_selected = X_filtered.drop('id', axis=1)
X_selected = scale(X_selected)

In [12]:
import sklearn
file_names = []
# This is needed to check the accuracy of the predictive modelling step.
for name in all_sites["SITE_ID"].values:
    file_names.append(name)
    
for i in range(15):
    X_selected, y, file_names = sklearn.utils.shuffle(X_selected, y, file_names, random_state=0)
    

In [13]:
#X_selected = X_selected.drop('id', axis=1)
X_selected.shape

(100, 481)

In [14]:
files_test = []
list_test_indices = []
test_files = [92, 45, 761, 10, 766, 400, 673, 49, 144, 496, 731, 281, 213, 197, 399]

for i in range(len(file_names)):
    # choosing always the same examples for testing to ensure the same results ....
    if file_names[i] in test_files:
        list_test_indices.append(i)
        files_test.append(file_names[i])
        
        
print(list_test_indices)

[0, 10, 18, 22, 30, 33, 39, 44, 45, 53, 70, 73, 80, 83, 90]


In [15]:
X_train = np.delete(X_selected, list_test_indices, 0)
X_test = np.take(X_selected, list_test_indices, 0)
y_train = np.delete(y, list_test_indices)
y_test = np.take(y, list_test_indices)

X_train.shape

(85, 481)

>### 3.1. SVC

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

cv_grid_svc = GridSearchCV(SVC(), param_grid = {
    'degree':[3,4],
    'shrinking':[True,False],
    'probability':[True,False]
})

cv_grid_svc.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'degree': [3, 4], 'shrinking': [True, False], 'probability': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [14]:
from sklearn import metrics

bestclassifier_svc=cv_grid_svc.best_estimator_

y_predict_svc = bestclassifier_svc.predict(X_test)
accuracy = metrics.accuracy_score(y_test,y_predict_svc)
print(cv_grid_svc.best_params_)
print("NUMBER OF ERRORS: " + str(sum(y_predict_svc != y_test)))
print("ERROR RATE: " + str(1 - sum(y_predict_svc == y_test) / float(len(y_predict_svc))) + "%")
print('Accuracy of the best classifier after CV is %.3f%%' % (accuracy*100))

{'degree': 3, 'probability': True, 'shrinking': True}
NUMBER OF ERRORS: 3
ERROR RATE: 0.19999999999999996%
Accuracy of the best classifier after CV is 80.000%


>### 3.2. Random Forest

In [15]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

cv_grid_rf = GridSearchCV(estimator = RandomForestClassifier(), param_grid = {
    'n_estimators': range(10, 100, 10),
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}, cv=10)

cv_grid_rf.fit(X_train, y_train)

  from numpy.core.umath_tests import inner1d


GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': range(10, 100, 10), 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [4, 5, 6, 7, 8], 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [16]:
from sklearn import metrics

bestclassifier_rf=cv_grid_rf.best_estimator_

y_predict_rf = bestclassifier_rf.predict(X_test)
accuracy = metrics.accuracy_score(y_test,y_predict_rf)
print(cv_grid_rf.best_params_)
print("NUMBER OF ERRORS: " + str(sum(y_predict_rf != y_test)))
print("ERROR RATE: " + str(1 - sum(y_predict_rf == y_test) / float(len(y_predict_rf))) + "%")
print('Accuracy of the best classifier after CV is %.3f%%' % (accuracy*100))

{'criterion': 'gini', 'max_depth': 4, 'max_features': 'log2', 'n_estimators': 70}
NUMBER OF ERRORS: 4
ERROR RATE: 0.2666666666666667%
Accuracy of the best classifier after CV is 73.333%


>### 3.3. Logistic Regression

In [17]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

cv_grid_lr = GridSearchCV(estimator = LogisticRegression(), param_grid = {
    'C': np.logspace(0, 4, 10),
    'penalty': ['l1', 'l2'],
}, cv=10)

cv_grid_lr.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': array([1.00000e+00, 2.78256e+00, 7.74264e+00, 2.15443e+01, 5.99484e+01,
       1.66810e+02, 4.64159e+02, 1.29155e+03, 3.59381e+03, 1.00000e+04]), 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [18]:
from sklearn import metrics

bestclassifier_lr=cv_grid_lr.best_estimator_

y_predict_lr = bestclassifier_lr.predict(X_test)
accuracy = metrics.accuracy_score(y_test,y_predict_lr)
print(cv_grid_lr.best_params_)
print("NUMBER OF ERRORS: " + str(sum(y_predict_lr != y_test)))
print("ERROR RATE: " + str(1 - sum(y_predict_lr == y_test) / float(len(y_predict_lr))) + "%")
print('Accuracy of the best classifier after CV is %.3f%%' % (accuracy*100))

{'C': 2.7825594022071245, 'penalty': 'l2'}
NUMBER OF ERRORS: 6
ERROR RATE: 0.4%
Accuracy of the best classifier after CV is 60.000%


>### 3.4. KNN

In [19]:
from sklearn.model_selection import GridSearchCV

cv_grid_knn = GridSearchCV(estimator = KNeighborsClassifier(), param_grid = {
    'n_neighbors':[4,5,6,7],
    'leaf_size':[1,3,5],
    'algorithm':['auto', 'kd_tree']
}, cv=10)

cv_grid_knn.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_neighbors': [4, 5, 6, 7], 'leaf_size': [1, 3, 5], 'algorithm': ['auto', 'kd_tree']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [20]:
from sklearn import metrics

bestclassifier_knn=cv_grid_knn.best_estimator_

y_predict_knn = bestclassifier_knn.predict(X_test)
accuracy = metrics.accuracy_score(y_test,y_predict_knn)
print(cv_grid_knn.best_params_)
print("NUMBER OF ERRORS: " + str(sum(y_predict_knn != y_test)))
print("ERROR RATE: " + str(1 - sum(y_predict_knn == y_test) / float(len(y_predict_knn))) + "%")
print('Accuracy of the best classifier after CV is %.3f%%' % (accuracy*100))

{'algorithm': 'auto', 'leaf_size': 1, 'n_neighbors': 4}
NUMBER OF ERRORS: 5
ERROR RATE: 0.33333333333333337%
Accuracy of the best classifier after CV is 66.667%


>### 3.5. XGBOOST

In [16]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
import warnings 
warnings.filterwarnings('ignore')

cv_grid_xb = GridSearchCV(estimator = XGBClassifier(), param_grid = {
    'max_depth': range (2, 10, 1),
    'n_estimators': range(10, 100, 10),
    'learning_rate': [0.1, 0.01, 0.05]
}, cv=10)

cv_grid_xb.fit(X_train, y_train)


GridSearchCV(cv=10, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': range(2, 10), 'n_estimators': range(10, 100, 10), 'learning_rate': [0.1, 0.01, 0.05]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [17]:
from sklearn import metrics

bestclassifier_xb=cv_grid_xb.best_estimator_

y_predict_xb = bestclassifier_xb.predict(X_test)
accuracy = metrics.accuracy_score(y_test,y_predict_xb)
print(cv_grid_xb.best_params_)
print("NUMBER OF ERRORS: " + str(sum(y_predict_xb != y_test)))
print("ERROR RATE: " + str(1 - sum(y_predict_xb == y_test) / float(len(y_predict_xb))) + "%")
print('Accuracy of the best classifier after CV is %.3f%%' % (accuracy*100))

{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 10}
NUMBER OF ERRORS: 5
ERROR RATE: 0.33333333333333337%
Accuracy of the best classifier after CV is 66.667%
