In [57]:
import warnings
# Silence every warning for the rest of the session to keep cell outputs compact.
# NOTE(review): this is a blanket filter, so deprecation and numerical warnings
# raised by the libraries used below are hidden as well — consider narrowing it
# to specific categories if results look suspicious.
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sktime.datasets import load_basic_motions

from sklearn.metrics import accuracy_score

# Load the BasicMotions data in long ("pd-multiindex") layout: X is a frame
# indexed by (instance, timepoints) with one column per dimension, y holds the
# class label of each instance.
X, y = load_basic_motions(
    return_type="pd-multiindex",
)

## Format data

In [58]:
# Preview the first five rows of the raw, multi-indexed frame.
X.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5
Unnamed: 0_level_1,timepoints,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0,0.079106,0.394032,0.551444,0.351565,0.02397,0.633883
0,1,0.079106,0.394032,0.551444,0.351565,0.02397,0.633883
0,2,-0.903497,-3.666397,-0.282844,-0.095881,-0.319605,0.972131
0,3,1.116125,-0.656101,0.333118,1.624657,-0.569962,1.209171
0,4,1.6382,1.405135,0.393875,1.187864,-0.271664,1.739182


In [59]:
# Move the two index levels into ordinary 'id' and 'timepoints' columns, which
# is the layout the feature-extraction cell addresses through column_id /
# column_sort.
# The guard keeps the cell idempotent: X is rebound in place, so without it a
# second run would try to insert an 'id' column that already exists and raise.
if isinstance(X.index, pd.MultiIndex):
    X = X.reset_index(names=['id', 'timepoints'])
X.head()

Unnamed: 0,id,timepoints,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5
0,0,0,0.079106,0.394032,0.551444,0.351565,0.02397,0.633883
1,0,1,0.079106,0.394032,0.551444,0.351565,0.02397,0.633883
2,0,2,-0.903497,-3.666397,-0.282844,-0.095881,-0.319605,0.972131
3,0,3,1.116125,-0.656101,0.333118,1.624657,-0.569962,1.209171
4,0,4,1.6382,1.405135,0.393875,1.187864,-0.271664,1.739182


## Compute features

In [69]:
from tsfresh import extract_features, extract_relevant_features, select_features

# Compute the tsfresh feature matrix (one row per 'id', series ordered by
# 'timepoints') and discard every feature column that contains a NaN.
X_feat = extract_features(
    X,
    column_id='id',
    column_sort='timepoints',
).dropna(axis=1)

Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 30/30 [00:07<00:00,  3.87it/s]


In [70]:
# Sanity check: the feature matrix and the label vector must have the same
# number of rows.
print(f"{X_feat.shape} {y.shape}")

(80, 3501) (80,)


In [71]:
# Preview the first five rows of the extracted feature matrix.
X_feat.head(n=5)

Unnamed: 0,dim_4__variance_larger_than_standard_deviation,dim_4__has_duplicate_max,dim_4__has_duplicate_min,dim_4__has_duplicate,dim_4__sum_values,dim_4__abs_energy,dim_4__mean_abs_change,dim_4__mean_change,dim_4__mean_second_derivative_central,dim_4__median,...,dim_3__fourier_entropy__bins_3,dim_3__fourier_entropy__bins_5,dim_3__fourier_entropy__bins_10,dim_3__fourier_entropy__bins_100,dim_3__permutation_entropy__dimension_3__tau_1,dim_3__permutation_entropy__dimension_4__tau_1,dim_3__permutation_entropy__dimension_5__tau_1,dim_3__permutation_entropy__dimension_6__tau_1,dim_3__permutation_entropy__dimension_7__tau_1,dim_3__mean_n_absolute_max__number_of_maxima_7
0,0.0,0.0,0.0,1.0,-1.211833,1.553383,0.067714,-0.00035,1.4e-05,-0.010653,...,0.482064,0.798569,1.306161,3.009338,1.703819,2.883721,3.829694,4.317736,4.493485,0.804719
1,0.0,0.0,0.0,1.0,-0.013321,1.326601,0.05437,0.001264,0.0,0.0,...,0.192626,0.192626,0.192626,0.60241,1.597731,2.744861,3.69381,4.294787,4.484304,2.687343
2,0.0,0.0,0.0,1.0,-1.720537,2.960324,0.084609,-8.1e-05,0.0,-0.01598,...,0.329286,0.413917,0.545824,1.783263,1.716592,2.724103,3.640944,4.178235,4.419746,0.603825
3,0.0,1.0,0.0,1.0,-0.295639,1.891844,0.075193,-0.001426,-5.4e-05,0.003995,...,0.26116,0.329286,0.329286,1.572172,1.714921,2.910144,3.837495,4.320396,4.469556,0.526587
4,0.0,0.0,0.0,1.0,0.972129,0.222901,0.029432,-5.4e-05,0.000217,0.010653,...,0.192626,0.356468,0.451359,2.044414,1.701245,2.791571,3.680559,4.230909,4.449241,0.170456


## Train model using all features

In [72]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Deterministic 50/50 split: with shuffle=False the first half of the rows is
# used for training and the second half for testing, in their original order.
# NOTE(review): the class balance of each half therefore depends entirely on
# the row order of the dataset — confirm both halves contain every class.
X_train, X_test, y_train, y_test = train_test_split(X_feat, y, test_size=0.5, shuffle=False)

# Baseline: 1-nearest-neighbour classifier trained on every extracted feature.
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)

preds = knn.predict(X_test)

# Test accuracy expressed as a percentage, rounded to two decimals.
knn_baseline_accuracy = round(accuracy_score(y_test, preds)*100,2)

print(knn_baseline_accuracy)

100.0


## Filter features

In [74]:
# One-vs-rest feature selection: for each class, test every feature against a
# binary "is this class" target and keep the union of the features that
# select_features retains for at least one class.
relevant_features = set()

# Iterate over the classes present in the training labels only. A class that is
# absent from y_train would yield an all-False target, which select_features
# cannot work with, and the test labels should not drive the selection.
for label in np.unique(y_train):
    y_train_binary = y_train == label
    X_train_filtered = select_features(X_train, y_train_binary)

    print(f"Number of relevant features for class {label}: {X_train_filtered.shape[1]}/{X_train.shape[1]}")

    # Accumulate in place instead of rebuilding the set on every iteration.
    relevant_features.update(X_train_filtered.columns)

Number of relevant features for class badminton: 703/3501
Number of relevant features for class running: 425/3501
Number of relevant features for class standing: 968/3501
Number of relevant features for class walking: 51/3501


In [76]:
# A set of strings has no stable iteration order between kernel sessions, so
# sort the selected names once and use the same ordered list for both splits.
# This keeps the column layout reproducible across runs.
selected_columns = sorted(relevant_features)
filtered_X_train = X_train[selected_columns]
filtered_X_test = X_test[selected_columns]

In [77]:
# Refit the same 1-NN classifier on the union of the per-class relevant
# features and score it on the matching test columns.
preds = knn.fit(filtered_X_train, y_train).predict(filtered_X_test)

# Test accuracy as a percentage, rounded to two decimals.
knn_selected_accuracy = round(100 * accuracy_score(y_test, preds), 2)
print(knn_selected_accuracy)

100.0


## Another way of filtering features for multiclass scenarios

In [78]:
# Multiclass selection in a single call. n_significant is the number of classes
# for which a feature has to be significant in order to be kept.
X_train_filtered_multi = select_features(
    X_train,
    y_train,
    multiclass=True,
    n_significant=3,
)
# Apply the columns chosen on the training split to the test split.
X_test_filtered_multi = X_test[X_train_filtered_multi.columns]

# Refit the existing 1-NN classifier on the reduced feature set.
preds = knn.fit(X_train_filtered_multi, y_train).predict(X_test_filtered_multi)

# NOTE(review): this rebinds knn_selected_accuracy, so the value computed from
# the per-class selection is no longer available afterwards. The recorded
# accuracy here is also far below the earlier runs — worth investigating
# before drawing conclusions from it.
knn_selected_accuracy = round(100 * accuracy_score(y_test, preds), 2)
print(knn_selected_accuracy)

47.5
