In [1]:
# Render matplotlib figures as interactive widgets (ipympl backend).
%matplotlib widget

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
from pathlib import Path

import lightgbm as lgb

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score, f1_score, roc_curve, make_scorer
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_validate

from PfyMU.gait.train_classifier.core import load_datasets
from PfyMU.features import *

# Use matplotlib's 'ggplot' style sheet for the figures created after this point.
plt.style.use('ggplot')

In [3]:
from scipy.signal import butter, sosfiltfilt


def mag_filter(x, fs):
    """
    Low-pass filter the vector magnitude of a signal and subtract 1.

    Parameters
    ----------
    x : numpy.ndarray
        Array with at least two dimensions; the Euclidean norm is taken along axis 1.
    fs : float
        Sampling frequency. The cutoff of 5 is expressed in the same units.

    Returns
    -------
    numpy.ndarray
        Forward-backward (zero-phase) filtered magnitude, minus 1.

    Notes
    -----
    NOTE(review): the `- 1` presumably removes the 1 g gravity offset, which
    assumes `x` is acceleration in units of g -- confirm against the caller.
    """
    nyquist = 0.5 * fs
    cutoff = 5

    # 1st-order Butterworth low-pass, designed as second-order sections.
    lowpass = butter(1, cutoff / nyquist, btype='low', output='sos')

    magnitude = np.linalg.norm(x, axis=1)
    smoothed = sosfiltfilt(lowpass, magnitude)
    return smoothed - 1

def mag_band_filter(x, fs):
    """
    Band-pass filter the vector magnitude of a signal.

    Parameters
    ----------
    x : numpy.ndarray
        Array with at least two dimensions; the Euclidean norm is taken along axis 1.
    fs : float
        Sampling frequency. The band edges (0.25 and 5) are in the same units.

    Returns
    -------
    numpy.ndarray
        Forward-backward (zero-phase) filtered magnitude.
    """
    nyquist = 0.5 * fs
    band_edges = [0.25 / nyquist, 5 / nyquist]

    # 1st-order Butterworth band-pass, designed as second-order sections.
    bandpass = butter(1, band_edges, btype='band', output='sos')

    magnitude = np.linalg.norm(x, axis=1)
    return sosfiltfilt(bandpass, magnitude)

def lp_filter(x, fs):
    """
    Filter every column of a signal along axis 0.

    Despite the ``lp_`` prefix, the filter designed here is a 1st-order
    Butterworth *band-pass* with edges at 0.25 and 5 (same units as `fs`),
    i.e. the same design used by `mag_band_filter`.

    Parameters
    ----------
    x : numpy.ndarray
        Signal to filter; filtering runs along axis 0.
    fs : float
        Sampling frequency.

    Returns
    -------
    numpy.ndarray
        Forward-backward (zero-phase) filtered signal, same shape as `x`.
    """
    nyquist = 0.5 * fs
    band_edges = [0.25 / nyquist, 5 / nyquist]

    bandpass = butter(1, band_edges, btype='band', output='sos')
    filtered = sosfiltfilt(bandpass, x, axis=0)
    return filtered
    
# Per-activity window step, handed to load_datasets through kwargs['window_step'].
# NOTE(review): the values mix fractions (< 1) with large integers; how each is
# interpreted is decided by load_datasets -- confirm there before changing them.
steps = {
    'walking': 0.4,
    'walking-impaired': 0.2,
    'sitting': 900,
    'standing': 300,
}

# Stairs and cycling all share the same step.
steps.update({
    activity: 0.3
    for activity in ('stairs-ascending', 'stairs-descending', 'cycling-50W', 'cycling-100W')
})

# Entry under the 'default' key (presumably the fallback for unlisted activities -- confirm).
steps['default'] = 1.0

In [4]:
import os

# Root folder of the processed gait datasets.
# The default is the location used when this notebook was last run; set the
# GAIT_DATASETS_PATH environment variable to run on another machine without
# editing this cell (e.g. '/Users/adamol/Documents/Datasets/gait/processed').
gait_sets_path = Path(
    os.environ.get('GAIT_DATASETS_PATH', '/home/lukasadamowicz/Documents/Datasets/processed')
)

# One entry per dataset, each located directly under the root folder.
datasets = [
    gait_sets_path / dataset_name
    for dataset_name in ('bluesky2', 'daliac', 'ltmm', 'usc-had')
]

# Arguments shared by every load_datasets call in this notebook.
# 'window_length' of 3.0 matches the "3.0s windows" reported further down;
# 'goal_fs' is presumably the sampling rate the signals are resampled to -- confirm.
kwargs = {
    'paths': datasets,
    'goal_fs': 50.0,
    'window_step': steps,
    'window_length': 3.0,
}

In [6]:
# Load the windowed signals, labels, subject ids and activity names,
# running lp_filter on the signal during loading.
X, Y, subjects, activities = load_datasets(
    acc_mag=False,
    signal_function=lp_filter,
    **kwargs
)

# Treat every activity whose name contains 'stair' as a positive (label 1) sample.
is_stair = ['stair' in activity for activity in activities]
Y[is_stair] = 1

In [9]:
# Same datasets, processed with mag_filter instead; only the first return value
# (the signal array) is kept, the remaining ones are discarded.
X_mf, *_ = load_datasets(
    acc_mag=False,
    signal_function=mag_filter,
    **kwargs
)

In [10]:
# Same datasets, processed with mag_band_filter instead; only the first return
# value (the signal array) is kept, the remaining ones are discarded.
X_mbf, *_ = load_datasets(
    acc_mag=False,
    signal_function=mag_band_filter,
    **kwargs
)

In [7]:
# Class balance of the labels. Windows from 'stair' activities were relabelled
# to 1 when the data was loaded, so they count as "walking" here.
n_windows = Y.size
n_walking = Y.sum()

print('Total samples (3.0s windows): ', n_windows)
print('Total walking samples: ', n_walking)
print('Total non-walking samples: ', n_windows - n_walking, '\n')
print(f'% walking samples: {n_walking / n_windows * 100:.2f}')

Total samples (3.0s windows):  43163
Total walking samples:  25758
Total non-walking samples:  17405 

% walking samples: 59.68


In [8]:
# Number of windows per activity, printed from the rarest to the most common,
# together with each activity's fraction of all windows.
unq_act, act_ct = np.unique(activities, return_counts=True)
N = act_ct.sum()
si = np.argsort(act_ct)

for position in si:
    name, count = unq_act[position], act_ct[position]
    print(f'{name:25s}: {count:5d} / {N:5d}{count/N:8.2f}')

sit-to-stand             :    16 / 43163    0.00
standing-assisted        :   207 / 43163    0.00
jumping-rope             :   212 / 43163    0.00
jumping                  :   311 / 43163    0.01
vacuuming                :   376 / 43163    0.01
lying                    :   378 / 43163    0.01
elevator-descending      :   475 / 43163    0.01
elevator-ascending       :   491 / 43163    0.01
running                  :   541 / 43163    0.01
sweeping                 :   612 / 43163    0.01
running-treadmill        :   755 / 43163    0.02
washing-dishes           :   776 / 43163    0.02
walking-left             :   787 / 43163    0.02
walking-right            :   842 / 43163    0.02
sleeping                 :  1126 / 43163    0.03
stairs-descending        :  2477 / 43163    0.06
cycling-50W              :  2509 / 43163    0.06
cycling-100W             :  2515 / 43163    0.06
stairs-ascending         :  2763 / 43163    0.06
standing                 :  2953 / 43163    0.07
sitting             

In [11]:
# Project every window onto its own principal components.
# The PCA is re-fit for each window, and X is overwritten in place, so after
# this cell X holds the per-window PCA scores rather than the filtered signal.
pca = PCA(n_components=3)

for idx, window in enumerate(X):
    X[idx] = pca.fit_transform(window)

In [12]:
# Subject-wise folds: each fold holds out a group of 4 subjects
# (2 for validation, 2 for testing) and trains on everyone else.
random.seed(5)  # fixed seed -> the shuffle, and therefore every fold, is repeatable

# Only subjects with more than 3 distinct activities are eligible to be held out.
rnd_subjects = [i for i in np.unique(subjects) if np.unique(activities[subjects == i]).size > 3]
random.shuffle(rnd_subjects)

training_masks, validation_masks, testing_masks = [], [], []

GROUP_SIZE = 4        # held-out subjects per fold
N_VALIDATION = 2      # the first N of each group validate, the rest test

for start in range(0, len(rnd_subjects), GROUP_SIZE):
    held_out = rnd_subjects[start:start + GROUP_SIZE]

    # When the number of eligible subjects is not a multiple of GROUP_SIZE the
    # last group is incomplete. Indexing past the end used to raise IndexError
    # here; skip that group so every fold keeps 2 validation + 2 test subjects.
    if len(held_out) < GROUP_SIZE:
        break

    vm = np.isin(subjects, held_out[:N_VALIDATION])
    tem = np.isin(subjects, held_out[N_VALIDATION:])
    trm = ~(vm | tem)  # train on every window that is not held out

    training_masks.append(trm)
    validation_masks.append(vm)
    testing_masks.append(tem)

In [13]:
# Feature bank applied to every signal representation below.
FB = Bank(window_length=None, window_step=None)

# Features in the order they are added to the bank.
bank_features = [
    # --- parameter-free / simple statistics ---
    Mean(),
    MeanCrossRate(),
    # StdDev() is left out: highly correlated with RMS
    Skewness(),
    Kurtosis(),
    Range(),
    IQR(),
    RMS(),
    LinearSlope(),
    SignalEntropy(),
    SPARC(),
    ComplexityInvariantDistance(normalize=True),
    JerkMetric(normalize=True),
    DimensionlessJerk(log=True, signal_type='acceleration'),
    # --- parameterised features ---
    Autocorrelation(lag=15, normalize=True),
    SampleEntropy(m=2, r=0.5),
    PermutationEntropy(order=3, delay=1, normalize=True),
    RangeCountPercentage(range_min=0.4, range_max=1.5),
    RangeCountPercentage(range_min=-0.5, range_max=0.5),
    DominantFrequency(low_cutoff=1.0, high_cutoff=3.5),
    DominantFrequencyValue(low_cutoff=0.25, high_cutoff=5.0),
    PowerSpectralSum(low_cutoff=1.0, high_cutoff=3.5),
    SpectralFlatness(low_cutoff=0.0, high_cutoff=6.0),
    SpectralEntropy(low_cutoff=0.0, high_cutoff=5.0),
    DetailPowerRatio(wavelet='coif4', freq_band=[1.0, 3.0]),
]

# `FB + feature` is used purely for its effect on FB (the result of `+` is
# discarded) -- presumably Bank registers the feature in place; confirm in PfyMU.
for feature in bank_features:
    FB + feature

In [14]:
# Features of the per-window PCA scores; one set of features per component.
X_pca, feature_names = FB.compute(
    X,
    fs=50.0,
    windowed=True,
    columns=['PC1', 'PC2', 'PC3']
)



In [15]:
# Features of the low-pass filtered magnitude.
# Note: X_mf is rebound from the loaded windows to the feature matrix, so
# re-running this cell without re-running the load would compute features of features.
X_mf, mf_fnames = FB.compute(
    X_mf,
    fs=50.0,
    windowed=True,
    columns=['']
)

In [16]:
# Features of the band-pass filtered magnitude.
# Note: X_mbf is rebound from the loaded windows to the feature matrix, so
# re-running this cell without re-running the load would compute features of features.
X_mbf, mbf_fnames = FB.compute(
    X_mbf,
    fs=50.0,
    windowed=True,
    columns=['']
)

In [17]:
def _feature_frame(labels, values, names):
    """
    Assemble a feature table with the label in the first column.

    Parameters
    ----------
    labels : array-like
        One label per row; stored in the 'Label' column.
    values : array-like, 2D
        Feature matrix, one row per sample and one column per entry of `names`.
    names : list of str
        Feature (column) names.

    Returns
    -------
    pandas.DataFrame
        Columns are ['Label'] + names, with the feature columns keeping the
        numeric dtype of `values`.
    """
    # Build the frame straight from the numeric matrix. Creating empty
    # placeholder columns and filling them through `.iloc` afterwards leaves the
    # column dtype up to pandas (placeholders can stay `object`), which is
    # fragile for dtype-sensitive consumers of this table.
    frame = pd.DataFrame(np.asarray(values), columns=list(names))
    frame.insert(0, 'Label', labels)
    return frame


feats_pca = _feature_frame(Y, X_pca, feature_names)
feats_mf = _feature_frame(Y, X_mf, mf_fnames)
feats_mbf = _feature_frame(Y, X_mbf, mbf_fnames)

# PPScore

In [18]:
import ppscore

In [19]:
# Predictive power score of every PCA feature with respect to 'Label'.
# NOTE(review): later cells use the row order of this frame as the feature
# ranking -- presumably ppscore returns it best-first; confirm in the ppscore docs.
pca_predictors = ppscore.predictors(feats_pca, 'Label', output='df')

fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(data=pca_predictors, x="x", y="ppscore", ax=ax)
# Slant the feature names so they stay readable.
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
fig.tight_layout()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [20]:
# Predictive power score of every low-pass-magnitude feature with respect to 'Label'.
# NOTE(review): later cells use the row order of this frame as the feature
# ranking -- presumably ppscore returns it best-first; confirm in the ppscore docs.
mf_predictors = ppscore.predictors(feats_mf, 'Label', output='df')

fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(data=mf_predictors, x="x", y="ppscore", ax=ax)
# Slant the feature names so they stay readable.
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
fig.tight_layout()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [21]:
# Predictive power score of every band-pass-magnitude feature with respect to 'Label'.
# NOTE(review): later cells use the row order of this frame as the feature
# ranking -- presumably ppscore returns it best-first; confirm in the ppscore docs.
mbf_predictors = ppscore.predictors(feats_mbf, 'Label', output='df')

fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(data=mbf_predictors, x="x", y="ppscore", ax=ax)
# Slant the feature names so they stay readable.
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
fig.tight_layout()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

# LightGBM Model

In [22]:
# Classifier shared by every cross-validation run below; the random_state is
# fixed so repeated fits are comparable.
lgb_model = lgb.LGBMClassifier(
    learning_rate=0.2,
    random_state=12049
)

# Eliminating PCA features

In [23]:
# Metrics computed on the validation subjects of every fold.
scoring = {
    'F1': make_scorer(f1_score),
    'bal_acc': make_scorer(balanced_accuracy_score),
}

# Mean scores (in %) per round, ordered from "all features" down to a single feature.
bacc, f1 = [], []

# Feature names in the row order of the ppscore table.
pca_features = pca_predictors.x.tolist()

# Each round keeps the first n_kept features, i.e. drops the last-listed one.
for n_kept in range(len(pca_features), 0, -1):
    kept = pca_features[:n_kept]
    scores = cross_validate(
        lgb_model,
        feats_pca.loc[:, kept].values,
        Y,
        scoring=scoring,
        # zip() is consumed by each call, so a fresh one is built every round
        cv=zip(training_masks, validation_masks),
        n_jobs=-1
    )

    bacc.append(100 * np.nanmean(scores['test_bal_acc']))
    f1.append(100 * np.nanmean(scores['test_F1']))

In [24]:
# Validation scores versus number of PCA features kept.
# Scores were appended from "all features" down to one, hence the descending x.
n_features = np.arange(len(f1), 0, -1)

f, ax = plt.subplots(figsize=(10, 4))
ax.plot(n_features, bacc, label='Bal. Acc.')
ax.plot(n_features, f1, label='F1')
ax.set_xlabel('Number of features kept')
ax.set_ylabel('Score [%]')
ax.legend()  # without this the line labels above are never shown
f.tight_layout()

figure_path = Path('feature_elimination/pca_w-stairs_features_elimination_pps.png')
# savefig does not create missing folders, so make sure the target exists.
figure_path.parent.mkdir(parents=True, exist_ok=True)
f.savefig(figure_path)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

# Eliminating filtered magnitude features

In [25]:
# Metrics computed on the validation subjects of every fold.
scoring = {
    'F1': make_scorer(f1_score),
    'bal_acc': make_scorer(balanced_accuracy_score),
}

# Mean scores (in %) per round, ordered from "all features" down to a single feature.
bacc, f1 = [], []

# Feature names in the row order of the ppscore table.
mf_features = mf_predictors.x.tolist()

# Each round keeps the first n_kept features, i.e. drops the last-listed one.
for n_kept in range(len(mf_features), 0, -1):
    kept = mf_features[:n_kept]
    scores = cross_validate(
        lgb_model,
        feats_mf.loc[:, kept].values,
        Y,
        scoring=scoring,
        # zip() is consumed by each call, so a fresh one is built every round
        cv=zip(training_masks, validation_masks),
        n_jobs=-1
    )

    bacc.append(100 * np.nanmean(scores['test_bal_acc']))
    f1.append(100 * np.nanmean(scores['test_F1']))

In [26]:
# Validation scores versus number of low-pass-magnitude features kept.
# Scores were appended from "all features" down to one, hence the descending x.
n_features = np.arange(len(f1), 0, -1)

f, ax = plt.subplots(figsize=(10, 4))
ax.plot(n_features, bacc, label='Bal. Acc.')
ax.plot(n_features, f1, label='F1')
ax.set_xlabel('Number of features kept')
ax.set_ylabel('Score [%]')
ax.legend()  # without this the line labels above are never shown
f.tight_layout()

figure_path = Path('feature_elimination/filt-mag-1_w-stairs_features_elimination_pps.png')
# savefig does not create missing folders, so make sure the target exists.
figure_path.parent.mkdir(parents=True, exist_ok=True)
f.savefig(figure_path)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

# Eliminating band filtered magnitude features

In [27]:
# Metrics computed on the validation subjects of every fold.
scoring = {
    'F1': make_scorer(f1_score),
    'bal_acc': make_scorer(balanced_accuracy_score),
}

# Mean scores (in %) per round, ordered from "all features" down to a single feature.
bacc, f1 = [], []

# Feature names in the row order of the ppscore table.
mbf_features = mbf_predictors.x.tolist()

# Each round keeps the first n_kept features, i.e. drops the last-listed one.
for n_kept in range(len(mbf_features), 0, -1):
    kept = mbf_features[:n_kept]
    scores = cross_validate(
        lgb_model,
        feats_mbf.loc[:, kept].values,
        Y,
        scoring=scoring,
        # zip() is consumed by each call, so a fresh one is built every round
        cv=zip(training_masks, validation_masks),
        n_jobs=-1
    )

    bacc.append(100 * np.nanmean(scores['test_bal_acc']))
    f1.append(100 * np.nanmean(scores['test_F1']))

In [28]:
# Validation scores versus number of band-pass-magnitude features kept.
# Scores were appended from "all features" down to one, hence the descending x.
n_features = np.arange(len(f1), 0, -1)

f, ax = plt.subplots(figsize=(10, 4))
ax.plot(n_features, bacc, label='Bal. Acc.')
ax.plot(n_features, f1, label='F1')
ax.set_xlabel('Number of features kept')
ax.set_ylabel('Score [%]')
ax.legend()  # without this the line labels above are never shown
f.tight_layout()

figure_path = Path('feature_elimination/band-filter_w-stairs_features_elimination_pps.png')
# savefig does not create missing folders, so make sure the target exists.
figure_path.parent.mkdir(parents=True, exist_ok=True)
f.savefig(figure_path)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [28]:
# Keep the first 18 band-pass-magnitude features of the ranking and save their
# names, one per line.
# NOTE(review): 18 was presumably read off the elimination plot -- confirm.
selected_features = mbf_features[:18]

# A dedicated handle name avoids rebinding `f`, which holds the last figure.
with open('lgb_features.txt', 'w') as features_file:
    features_file.writelines(ft + '\n' for ft in selected_features)

# Last expression of the cell, so the selection is shown as the cell output
# (a bare expression in the middle of a cell is evaluated but never displayed).
selected_features

In [29]:
# Cross-validate the model on the 18 selected band-pass-magnitude features,
# scoring on the validation subjects of every fold.
selected_values = feats_mbf.loc[:, mbf_features[:18]].values

scores = cross_validate(
    lgb_model,
    selected_values,
    Y,
    scoring=scoring,
    cv=zip(training_masks, validation_masks),
    n_jobs=-1
)

# Mean balanced accuracy first, then mean F1 (as fractions, not %).
for metric in ('test_bal_acc', 'test_F1'):
    print(np.mean(scores[metric]))

0.921219527085372
0.892434331501833
