In [1]:
%matplotlib widget

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys

from PfyMU.gait.train_classifier.core import load_datasets

In [3]:
steps = {
    'jumping-rope': 0.15,
    'stairs-descending': 0.1,
    'stairs-ascending': 0.1,
    'jumping': 0.15,
    'lying': 0.15,
    'elevator-ascending': 0.15,
    'elevator-descending': 0.15,
    'running': 0.075,
    'sweeping': 0.15,
    'standing': 225,
    'running-treadmill': 0.1,
    'cycling-50W': 0.12,
    'cycling-100W': 0.12,
    'walking-left': 0.2,
    'walking-right': 0.2,
    'walking-impaired': 0.2,
    'walking': 0.25,
    'sitting': 400,
    'default': 0.5
}



In [4]:
# gait_sets_path = Path('/Users/adamol/Documents/Datasets/gait/processed')
gait_sets_path = Path('/home/lukasadamowicz/Documents/Datasets/processed')

datasets = [
    gait_sets_path / 'bluesky2',
    gait_sets_path / 'daliac',
    gait_sets_path / 'ltmm',
    gait_sets_path / 'usc-had'
]

X, Y, subjects, activities = load_datasets(
    datasets, 
    goal_fs=50.0, 
    acc_mag=True, 
    window_length=3.0, 
    window_step=steps
)

In [6]:
# make stair-climbing in the positive class
# mask = (activities == 'stairs-ascending') | (activities == 'stairs-descending')
# Y[mask] = 1

## Dataset class summary

In [7]:
print('Total samples (3.0s windows): ', Y.size)
print('Total walking samples: ', Y.sum())
print('Total non-walking samples: ', Y.size - Y.sum(), '\n')
print(f'% walking samples: {Y.sum() / Y.size * 100:.2f}')

Total samples (3.0s windows):  106767
Total walking samples:  32781
Total non-walking samples:  73986 

% walking samples: 30.70


In [8]:
unq_act, act_ct = np.unique(activities, return_counts=True)
N = np.sum(act_ct)
si = np.argsort(act_ct)
for a, c in zip(unq_act[si], act_ct[si]):
    print(f'{a:25s}: {c:5d} / {N:5d}{c/N:8.2f}')

sit-to-stand             :    30 / 106767    0.00
standing-assisted        :   401 / 106767    0.00
vacuuming                :   736 / 106767    0.01
jumping-rope             :  1373 / 106767    0.01
washing-dishes           :  1538 / 106767    0.01
jumping                  :  1869 / 106767    0.02
sleeping                 :  2245 / 106767    0.02
lying                    :  2477 / 106767    0.02
elevator-descending      :  3070 / 106767    0.03
elevator-ascending       :  3119 / 106767    0.03
walking-left             :  3774 / 106767    0.04
standing                 :  3875 / 106767    0.04
walking-right            :  4023 / 106767    0.04
sweeping                 :  4090 / 106767    0.04
cycling-50W              :  6251 / 106767    0.06
cycling-100W             :  6266 / 106767    0.06
running                  :  6764 / 106767    0.06
sitting                  :  6920 / 106767    0.06
stairs-descending        :  7336 / 106767    0.07
running-treadmill        :  7415 / 106767    0.07


## Feature Generation

In [9]:
from PfyMU.features import *

In [10]:
FB = Bank(window_length=None, window_step=None)

# add features
FB + Mean()
FB + MeanCrossRate()
FB + StdDev()
FB + Skewness()
FB + Kurtosis()
FB + Range()
FB + IQR()
FB + RMS()
FB + Autocorrelation(lag=1, normalize=True)
FB + LinearSlope()
FB + SignalEntropy()
FB + SampleEntropy(m=4, r=1.0)
FB + PermutationEntropy(order=3, delay=1, normalize=True)
FB + ComplexityInvariantDistance(normalize=True)
FB + RangeCountPercentage(range_min=0, range_max=1.0)
FB + RatioBeyondRSigma(r=2.0)
FB + JerkMetric(normalize=True)
FB + DimensionlessJerk(log=True, signal_type='acceleration')
FB + SPARC()
FB + DominantFrequency(low_cutoff=0.25, high_cutoff=5.0)
FB + DominantFrequencyValue(low_cutoff=0.25, high_cutoff=5.0)
FB + PowerSpectralSum(low_cutoff=0.25, high_cutoff=5.0)
FB + SpectralFlatness(low_cutoff=0.25, high_cutoff=5.0)
FB + SpectralEntropy(low_cutoff=0.25, high_cutoff=5.0)
FB + DetailPower(wavelet='coif4', freq_band=[1.0, 3.0])
FB + DetailPowerRatio(wavelet='coif4', freq_band=[1.0, 3.0])

In [11]:
X_feat, feature_names = FB.compute(X, fs=50.0, windowed=True, columns=[''])



## Feature Exploration

In [12]:
feats = pd.DataFrame(
    index=range(X_feat.shape[0]), 
#     columns=['Subject', 'Activity', 'Label'] + feature_names,
    columns=['Label'] + feature_names,
    dtype='float'
)
# feats['Subject'] = feats.Subject.astype('str')
# feats['Activity'] = feats.Activity.astype('str')

feats.iloc[:, 1:] = X_feat
feats['Label'] = Y
feats['Label'] = feats.Label.astype('int')
# feats['Subject'] = subjects
# feats['Activity'] = activities

In [18]:
# feats.to_hdf('features.h5', key='no_preprocessing')

### Feature distributions

In [19]:
plt.close('all')

f, ax = plt.subplots(nrows=7, ncols=4, figsize=(10, 10), sharex=True)

j, k = 0, 0
for i, ft in enumerate(feature_names):
    if k > 3:
        j += 1
        k  = 0
    sns.violinplot(x='Label', y=ft, data=feats, ax=ax[j, k])
    ax[j, k].set_ylabel(None)
    ax[j, k].set_title(ft)
    ax[j, k].set_xlabel(None)
    
    k += 1

for k in range(4):
    ax[-1, k].set_xticklabels(['NG', 'G'])

f.tight_layout()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

### Predictive Power Score

In [12]:
import ppscore

In [13]:
df_predictors = ppscore.predictors(feats, 'Label', output='df')
plt.figure()
ax = sns.barplot(data=df_predictors, x="x", y="ppscore")
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
plt.tight_layout()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [24]:
pps_matrix = ppscore.matrix(feats, output='df')

In [25]:
plt.figure(figsize=(15, 10))
sns.heatmap(pps_matrix, vmin=0, vmax=1, cmap="Blues", linewidths=0.5, annot=True)
plt.tight_layout()
# plt.savefig('PPScore_matrix.pdf')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [27]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import RobustScaler

In [28]:
pca = PCA(n_components=5)
x_pca = pca.fit_transform(RobustScaler().fit_transform(feats))
df_pca = pd.DataFrame(x_pca, columns=[f'PC {i+1}' for i in range(x_pca.shape[1])])
df_pca['Label'] = Y
df_pca.head(2)

Unnamed: 0,PC 1,PC 2,PC 3,PC 4,PC 5,Label
0,-3.316644,1.451834,-1.855846,0.645684,0.80939,0
1,-3.336631,5.4586,-0.741368,0.589801,0.856591,0


In [29]:
sns.pairplot(df_pca, hue='Label')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<seaborn.axisgrid.PairGrid at 0x7f85b95d2fd0>

In [30]:
from separation_stats import db_2class, corr_select, cohen_d, ttest_select, mwu_select, auc_score

In [32]:
sep_df = pd.DataFrame()
sep_df['Feature'] = feature_names

sep_df['DBI'], db_rank = db_2class(X_feat, Y)
sep_df['Correlation r'] = corr_select(X_feat, Y)
sep_df['T-test p'], sep_df['Cohens d'] = ttest_select(X_feat, Y)
sep_df['MWU-test p'], _ = mwu_select(X_feat, Y)
sep_df['AUC'] = auc_score(X_feat, Y)

sep_df['Cohens d'] = sep_df['Cohens d'].abs()
sep_df['AUC'] = sep_df['AUC'].apply(lambda i: 1 - i if i < 0.5 else i)

In [33]:
sep_df.sort_values('DBI')

Unnamed: 0,Feature,DBI,Correlation r,T-test p,Cohens d,MWU-test p,AUC
21,_powerspectralsum,0.868489,-0.637493,0.0,1.660115,0.0,0.869955
20,_dominantfrequencyvalue,1.078426,-0.398076,0.0,0.870644,0.0,0.732411
23,_spectralentropy,1.153738,0.648139,0.0,1.707638,0.0,0.896032
12,_permutationentropy,1.317614,-0.616864,0.0,1.572484,0.0,0.872063
22,_spectralflatness,1.461314,0.46589,0.0,1.056398,0.0,0.782482
19,_dominantfrequency,1.711303,0.108109,1.7324269999999998e-270,0.218184,0.0,0.58683
17,_dimensionlessjerk,2.086894,-0.393905,0.0,0.859835,0.0,0.666175
13,_complexityinvariantdistance,2.299014,-0.442206,0.0,0.989201,0.0,0.769706
1,_meancrossrate,2.300061,-0.442711,0.0,0.990607,0.0,0.755821
8,_autocorrelation,2.870757,0.424606,0.0,0.94095,0.0,0.769898


In [34]:
sep_df.sort_values('Cohens d', ascending=False)

Unnamed: 0,Feature,DBI,Correlation r,T-test p,Cohens d,MWU-test p,AUC
23,_spectralentropy,1.153738,0.648139,0.0,1.707638,0.0,0.896032
21,_powerspectralsum,0.868489,-0.637493,0.0,1.660115,0.0,0.869955
12,_permutationentropy,1.317614,-0.616864,0.0,1.572484,0.0,0.872063
22,_spectralflatness,1.461314,0.46589,0.0,1.056398,0.0,0.782482
1,_meancrossrate,2.300061,-0.442711,0.0,0.990607,0.0,0.755821
13,_complexityinvariantdistance,2.299014,-0.442206,0.0,0.989201,0.0,0.769706
8,_autocorrelation,2.870757,0.424606,0.0,0.94095,0.0,0.769898
20,_dominantfrequencyvalue,1.078426,-0.398076,0.0,0.870644,0.0,0.732411
17,_dimensionlessjerk,2.086894,-0.393905,0.0,0.859835,0.0,0.666175
14,_rangecountpercentage,3.657401,0.363783,0.0,0.78357,0.0,0.724231


In [35]:
sep_df.sort_values('AUC', ascending=False)

Unnamed: 0,Feature,DBI,Correlation r,T-test p,Cohens d,MWU-test p,AUC
23,_spectralentropy,1.153738,0.648139,0.0,1.707638,0.0,0.896032
12,_permutationentropy,1.317614,-0.616864,0.0,1.572484,0.0,0.872063
21,_powerspectralsum,0.868489,-0.637493,0.0,1.660115,0.0,0.869955
22,_spectralflatness,1.461314,0.46589,0.0,1.056398,0.0,0.782482
8,_autocorrelation,2.870757,0.424606,0.0,0.94095,0.0,0.769898
13,_complexityinvariantdistance,2.299014,-0.442206,0.0,0.989201,0.0,0.769706
1,_meancrossrate,2.300061,-0.442711,0.0,0.990607,0.0,0.755821
20,_dominantfrequencyvalue,1.078426,-0.398076,0.0,0.870644,0.0,0.732411
25,_detailpowerratio,4.065702,0.019841,1.269698e-10,0.039817,0.0,0.727072
14,_rangecountpercentage,3.657401,0.363783,0.0,0.78357,0.0,0.724231


### Pair plots (Top Features)

In [None]:
top_feats = ['PowerSpectralSum', 'PermutationEntropy', 'SpectralEntropy', 'DominantFrequency', 'DimensionlessJerk', 'MeanCrossRate']

sns.pairplot(feats, hue='Label', vars=top_feats)