In [1]:
# Select the interactive widget backend so figures render as interactive widgets
# (NOTE(review): this presumably requires the ipympl package in the kernel environment — confirm).
%matplotlib widget

In [2]:
import os
import random
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm as lgb

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score, f1_score, precision_score, recall_score

from PfyMU.gait.train_classifier.core import load_datasets
from PfyMU.features import *

plt.style.use('ggplot')

In [3]:
# Per-activity window step configuration; `steps` is handed to
# load_datasets(..., window_step=steps) in the data-loading cell.
# NOTE(review): the values mix fractions (< 1) with large integers (225, 400, ...);
# how each kind is interpreted is decided inside load_datasets — confirm there.
#
# NOTE: this first mapping has no effect. `steps` is reassigned immediately below,
# so only the second mapping is ever used.
steps = {
    'jumping-rope': 0.15,
    'stairs-descending': 0.1,
    'stairs-ascending': 0.1,
    'jumping': 0.15,
    'lying': 0.15,
    'elevator-ascending': 0.15,
    'elevator-descending': 0.15,
    'running': 0.075,
    'sweeping': 0.15,
    'standing': 225,
    'running-treadmill': 0.1,
    'cycling-50W': 0.12,
    'cycling-100W': 0.12,
    'walking-left': 0.2,
    'walking-right': 0.2,
    'walking-impaired': 0.2,
    'walking': 0.25,
    'sitting': 400,
    'default': 0.5
}

# Active configuration: replaces the mapping above. Activities without their own
# key are presumably covered by the 'default' entry — verify against load_datasets.
steps = {
    'walking': 0.4,
    'walking-impaired': 0.2,
    'sitting': 900,
    'standing': 300,
    'stairs-ascending': 0.3,
    'stairs-descending': 0.3,
    'cycling-50W': 0.3,
    'cycling-100W': 0.3,
    'default': 1.0
}

In [4]:
# Root folder holding the processed gait datasets. The location is machine
# specific, so it can be overridden without editing the notebook, e.g.
#   export GAIT_DATASETS_DIR=/Users/adamol/Documents/Datasets/gait/processed
# When the variable is not set, the original hard-coded location is used.
gait_sets_path = Path(
    os.environ.get('GAIT_DATASETS_DIR', '/home/lukasadamowicz/Documents/Datasets/processed')
)

# One sub-folder per study that is pooled into the training data
datasets = [
    gait_sets_path / 'bluesky2',
    gait_sets_path / 'daliac',
    gait_sets_path / 'ltmm',
    gait_sets_path / 'usc-had'
]

# Load every study with the settings below (3.0 s windows, per-activity window step
# taken from `steps`).
# NOTE(review): the 4 returned objects are used downstream as per-window arrays
# (data, binary label, subject id, activity name) — confirm against load_datasets.
X, Y, subjects, activities = load_datasets(
    datasets,
    goal_fs=50.0,
    acc_mag=True,
    window_length=3.0,
    window_step=steps
)

# make stair-climbing in the positive class
# (work on a copy so that Y keeps its original labels)
mask = (activities == 'stairs-ascending') | (activities == 'stairs-descending')
Y_inc_str = Y.copy()
Y_inc_str[mask] = 1

In [5]:
# Class balance of the windowed data set. The sum of Y is reported as the number
# of walking windows, i.e. Y is treated as a 0/1 walking label.
n_windows = Y.size
n_walking = Y.sum()

print('Total samples (3.0s windows): ', n_windows)
print('Total walking samples: ', n_walking)
print('Total non-walking samples: ', n_windows - n_walking, '\n')
print(f'% walking samples: {n_walking / n_windows * 100:.2f}')

Total samples (3.0s windows):  43163
Total walking samples:  20518
Total non-walking samples:  22645 

% walking samples: 47.54


In [6]:
# Number of windows per activity, listed from the rarest to the most common,
# together with each activity's share of all windows.
unq_act, act_ct = np.unique(activities, return_counts=True)
N = np.sum(act_ct)
si = np.argsort(act_ct)  # ascending order of window count

for idx in si:
    print(f'{unq_act[idx]:25s}: {act_ct[idx]:5d} / {N:5d}{act_ct[idx]/N:8.2f}')

sit-to-stand             :    16 / 43163    0.00
standing-assisted        :   207 / 43163    0.00
jumping-rope             :   212 / 43163    0.00
jumping                  :   311 / 43163    0.01
vacuuming                :   376 / 43163    0.01
lying                    :   378 / 43163    0.01
elevator-descending      :   475 / 43163    0.01
elevator-ascending       :   491 / 43163    0.01
running                  :   541 / 43163    0.01
sweeping                 :   612 / 43163    0.01
running-treadmill        :   755 / 43163    0.02
washing-dishes           :   776 / 43163    0.02
walking-left             :   787 / 43163    0.02
walking-right            :   842 / 43163    0.02
sleeping                 :  1126 / 43163    0.03
stairs-descending        :  2477 / 43163    0.06
cycling-50W              :  2509 / 43163    0.06
cycling-100W             :  2515 / 43163    0.06
stairs-ascending         :  2763 / 43163    0.06
standing                 :  2953 / 43163    0.07
sitting             

In [7]:
# Build subject-wise train / validation / test masks.
#
# Eligible subjects are shuffled (fixed seed) and consumed in groups of 4:
# the first 2 subjects of a group form the validation set, the last 2 the test
# set, and every window that belongs to none of the 4 is used for training.
# All masks are boolean arrays with one entry per window (row of sa_df).
sa_df = pd.DataFrame(data={'Subject': subjects, 'Activity': activities})
sa_df['col1'] = 1.0  # dummy column so the groupby/count below has a column to count

# get the subjects for which LOSO actually makes sense: those with multiple activities (ie more than just walking)
# gbc has one row per (Subject, Activity) pair, so the number of rows for a subject
# is its number of distinct activities; subjects need more than 3 to be eligible.
gbc = sa_df.groupby(['Subject', 'Activity'], as_index=False).count()
loso_subjects = [i for i in gbc.Subject.unique() if gbc.loc[gbc.Subject == i].shape[0] > 3]

random.seed(184751029)  # fix the generation so that its the same every time
random.shuffle(loso_subjects)

training_masks = []
validation_masks = []
testing_masks = []

subjects_per_fold = 4   # subjects held out of training in each fold
n_validation = 2        # of those, how many go to validation (the rest go to testing)

# Only complete groups are turned into folds. Indexing past the end of
# loso_subjects (which happened when the subject count was not a multiple of the
# group size) raised an IndexError; left-over subjects now simply stay in the
# training data of every fold.
n_leftover = len(loso_subjects) % subjects_per_fold
if n_leftover:
    print(f'{n_leftover} subject(s) do not fill a complete fold and are never held out')

for i in range(0, len(loso_subjects) - n_leftover, subjects_per_fold):
    fold_subjects = loso_subjects[i:i + subjects_per_fold]

    v_m = sa_df.Subject.isin(fold_subjects[:n_validation]).values
    te_m = sa_df.Subject.isin(fold_subjects[n_validation:]).values
    tr_m = ~(v_m | te_m)  # train on everything not held out in this fold

    training_masks.append(tr_m)
    validation_masks.append(v_m)
    testing_masks.append(te_m)

In [8]:
# Feature bank used to turn each window into one feature vector.
# window_length/window_step are None here; the data is already windowed
# (FB.compute is later called with windowed=True).
FB = Bank(window_length=None, window_step=None)

# add features
# Each `FB + Feature(...)` statement discards its result, so this relies on
# Bank.__add__ registering the feature on FB in place.
# NOTE(review): the statement order presumably fixes the column order of the
# computed feature matrix — confirm before reordering anything below.

# parameter-free summary features
FB + Mean()
FB + MeanCrossRate()
FB + StdDev()
FB + Skewness()
FB + Kurtosis()
FB + Range()
FB + IQR()
FB + RMS()
FB + LinearSlope()
FB + SignalEntropy()
FB + SPARC()
FB + ComplexityInvariantDistance(normalize=True)
FB + JerkMetric(normalize=True)
FB + DimensionlessJerk(log=True, signal_type='acceleration')

# autocorrelation at several lags
FB + Autocorrelation(lag=1, normalize=True)
FB + Autocorrelation(lag=15, normalize=True)
FB + Autocorrelation(lag=14, normalize=True)
FB + Autocorrelation(lag=12, normalize=True)

# sample entropy for several (m, r) settings
FB + SampleEntropy(m=4, r=1.0)
FB + SampleEntropy(m=2, r=0.75)
FB + SampleEntropy(m=3, r=0.75)
FB + SampleEntropy(m=2, r=0.5)
FB + SampleEntropy(m=2, r=0.25)

# permutation entropy for several (order, delay) settings
FB + PermutationEntropy(order=3, delay=1, normalize=True)
FB + PermutationEntropy(order=5, delay=1, normalize=True)
FB + PermutationEntropy(order=8, delay=1, normalize=True)
FB + PermutationEntropy(order=10, delay=1, normalize=True)
FB + PermutationEntropy(order=8, delay=2, normalize=True)
FB + PermutationEntropy(order=8, delay=8, normalize=True)

# range-count percentage for several [range_min, range_max] intervals
FB + RangeCountPercentage(range_min=0, range_max=1.0)
FB + RangeCountPercentage(range_min=0.5, range_max=1.4)
FB + RangeCountPercentage(range_min=0.3, range_max=1.4)
FB + RangeCountPercentage(range_min=1, range_max=1.4)
FB + RangeCountPercentage(range_min=0, range_max=1.5)

# ratio beyond r sigma for several r
FB + RatioBeyondRSigma(r=1.0)
FB + RatioBeyondRSigma(r=2.5)
FB + RatioBeyondRSigma(r=0.5)

# frequency-domain features, each evaluated over several [low_cutoff, high_cutoff] bands
# (NOTE(review): cutoffs are presumably in Hz — confirm in PfyMU.features)
FB + DominantFrequency(low_cutoff=0.25, high_cutoff=5.0)
FB + DominantFrequency(low_cutoff=1.0, high_cutoff=3.5)
FB + DominantFrequency(low_cutoff=1.0, high_cutoff=3.0)
FB + DominantFrequency(low_cutoff=1.5, high_cutoff=6.0)
FB + DominantFrequency(low_cutoff=0.5, high_cutoff=3.0)

FB + DominantFrequencyValue(low_cutoff=0.25, high_cutoff=5.0)
FB + DominantFrequencyValue(low_cutoff=1.0, high_cutoff=3.5)
FB + DominantFrequencyValue(low_cutoff=1.0, high_cutoff=3.0)
FB + DominantFrequencyValue(low_cutoff=1.5, high_cutoff=6.0)
FB + DominantFrequencyValue(low_cutoff=0.5, high_cutoff=3.0)

FB + PowerSpectralSum(low_cutoff=0.25, high_cutoff=5.0)
FB + PowerSpectralSum(low_cutoff=1.0, high_cutoff=3.0)
FB + PowerSpectralSum(low_cutoff=1.5, high_cutoff=3.5)
FB + PowerSpectralSum(low_cutoff=0.25, high_cutoff=4.0)
FB + PowerSpectralSum(low_cutoff=0.25, high_cutoff=3.0)

FB + SpectralFlatness(low_cutoff=0.25, high_cutoff=5.0)
FB + SpectralFlatness(low_cutoff=0.0, high_cutoff=6.0)
FB + SpectralFlatness(low_cutoff=0.0, high_cutoff=8.0)
FB + SpectralFlatness(low_cutoff=0.0, high_cutoff=3.5)
FB + SpectralFlatness(low_cutoff=0.5, high_cutoff=3.5)

FB + SpectralEntropy(low_cutoff=0.25, high_cutoff=5.0)
FB + SpectralEntropy(low_cutoff=0.0, high_cutoff=5.0)
FB + SpectralEntropy(low_cutoff=0.0, high_cutoff=3.5)
FB + SpectralEntropy(low_cutoff=0.25, high_cutoff=3.0)
FB + SpectralEntropy(low_cutoff=1.5, high_cutoff=4.0)

# wavelet-based features (coif4 wavelet, 1.0-3.0 band)
FB + DetailPower(wavelet='coif4', freq_band=[1.0, 3.0])

FB + DetailPowerRatio(wavelet='coif4', freq_band=[1.0, 3.0])

In [9]:
# Evaluate every feature registered on FB for the loaded windows. fs matches the
# goal_fs=50.0 used when loading, and windowed=True is passed because X was
# already cut into windows by load_datasets.
# NOTE(review): X_feat is later used as a (windows x features) matrix whose columns
# line up with feature_names; columns=[''] presumably leaves the names without an
# axis label — confirm against Bank.compute.
X_feat, feature_names = FB.compute(X, fs=50.0, windowed=True, columns=[''])



In [10]:
# Feature matrix as a float DataFrame with one named column per feature
feats = pd.DataFrame(X_feat, columns=feature_names, dtype='float')

# Two label sets: the original labels, and the variant in which
# stair ascending/descending has also been set to 1
labels, labels_istrs = Y, Y_inc_str

# Boruta

In [11]:
from sklearn.ensemble import RandomForestClassifier as RFC
import lightgbm as lgb
from boruta import BorutaPy

In [18]:
# Boruta feature selection wrapped around a shallow random forest, fit only on
# the training windows of the first fold.
rf = RFC(max_depth=5, random_state=30951)

# random_state is passed to BorutaPy as well: Boruta uses its own random state
# for shuffling the shadow features, so seeding only the forest left the
# selection different from run to run.
rfbor = BorutaPy(
    estimator=rf,
    n_estimators=20,
    max_iter=100,
    random_state=30951,
    verbose=1
)

# trailing ';' suppresses the repr of the fitted selector in the cell output
rfbor.fit(X_feat[training_masks[0]], Y[training_masks[0]]);

Iteration: 1 / 100
Iteration: 2 / 100
Iteration: 3 / 100
Iteration: 4 / 100
Iteration: 5 / 100
Iteration: 6 / 100
Iteration: 7 / 100
Iteration: 8 / 100
Iteration: 9 / 100
Iteration: 10 / 100
Iteration: 11 / 100
Iteration: 12 / 100
Iteration: 13 / 100
Iteration: 14 / 100
Iteration: 15 / 100
Iteration: 16 / 100
Iteration: 17 / 100
Iteration: 18 / 100
Iteration: 19 / 100
Iteration: 20 / 100
Iteration: 21 / 100
Iteration: 22 / 100
Iteration: 23 / 100
Iteration: 24 / 100
Iteration: 25 / 100
Iteration: 26 / 100
Iteration: 27 / 100
Iteration: 28 / 100
Iteration: 29 / 100
Iteration: 30 / 100
Iteration: 31 / 100
Iteration: 32 / 100
Iteration: 33 / 100
Iteration: 34 / 100
Iteration: 35 / 100
Iteration: 36 / 100
Iteration: 37 / 100
Iteration: 38 / 100
Iteration: 39 / 100
Iteration: 40 / 100
Iteration: 41 / 100
Iteration: 42 / 100
Iteration: 43 / 100
Iteration: 44 / 100
Iteration: 45 / 100
Iteration: 46 / 100
Iteration: 47 / 100
Iteration: 48 / 100
Iteration: 49 / 100
Iteration: 50 / 100
Iteration

In [14]:
# Feature names picked out by the two Boruta support masks
# (NOTE(review): per the BorutaPy docs support_ marks confirmed features and
# support_weak_ tentative ones — confirm for the installed version).
name_arr = np.array(feature_names)
green_area = name_arr[rfbor.support_]
blue_area = name_arr[rfbor.support_weak_]

print('features in the green area:')
for position, name in enumerate(green_area):
    print(position, name)

print('\nfeatures in the blue area:', blue_area)

features in the green area:
0 mean
1 meancrossrate
2 stddev
3 skewness
4 kurtosis
5 range
6 iqr
7 rms
8 linearslope
9 signalentropy
10 sparc_4_10.00_0.05
11 complexityinvariantdistance_True
12 jerkmetric
13 dimensionlessjerk_True_acceleration
14 autocorrelation_1_True
15 autocorrelation_15_True
16 autocorrelation_14_True
17 autocorrelation_12_True
18 sampleentropy_4_1.00
19 sampleentropy_2_0.75
20 sampleentropy_3_0.75
21 sampleentropy_2_0.50
22 sampleentropy_2_0.25
23 permutationentropy_3_1_True
24 permutationentropy_5_1_True
25 permutationentropy_8_1_True
26 permutationentropy_10_1_True
27 permutationentropy_8_2_True
28 permutationentropy_8_8_True
29 rangecountpercentage_0_1.00
30 rangecountpercentage_0.50_1.40
31 rangecountpercentage_0.30_1.40
32 rangecountpercentage_1_1.40
33 rangecountpercentage_0_1.50
34 ratiobeyondrsigma_1.00
35 ratiobeyondrsigma_2.50
36 ratiobeyondrsigma_0.50
37 dominantfrequency_0.25_5.00
38 dominantfrequency_1.00_3.50
39 dominantfrequency_1.00_3.00
40 dominant

In [15]:
# Show the Boruta rank of every feature, in feature_names order.
# NOTE(review): per the BorutaPy docs rank 1 = selected and rank 2 = tentative — confirm.
rfbor.ranking_

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])