In [1]:
%matplotlib widget

In [19]:
import pandas as pd
import numpy as np
import random
from sklearn.base import clone
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.svm import OneClassSVM, SVC
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score, precision_score, make_scorer
from sklearn.model_selection import cross_val_score
import xgboost as xgb
import lightgbm as lgb

In [6]:
# Load the pre-computed feature table (one row per window).
# NOTE(review): assumes the first three columns are metadata (Subject, Activity,
# Label are all referenced in this notebook) and the rest are features — TODO confirm.
data = pd.read_hdf(
    '../feature_exploration/features.h5',
    key='no_preprocessing',
)
labels = data['Label']    # classification target
feats = data.iloc[:, 3:]  # everything after the leading three columns

In [23]:
# List every distinct activity label present in the feature table
data['Activity'].unique()

array(['sitting', 'standing', 'walking', 'walking-impaired',
       'standing-assisted', 'sit-to-stand', 'cycling-100W', 'cycling-50W',
       'jumping-rope', 'lying', 'running-treadmill', 'stairs-ascending',
       'stairs-descending', 'sweeping', 'vacuuming', 'washing-dishes',
       'elevator-ascending', 'elevator-descending', 'jumping', 'running',
       'sleeping', 'walking-left', 'walking-right'], dtype=object)

In [31]:
# Build the k-fold splits on subjects rather than on individual windows, so that
# no subject contributes windows to both the training and the held-out sets.
gbc = data.groupby(['Subject', 'Activity'], as_index=False).count()
# only subjects with more than 3 (Subject, Activity) groups are rotated through
# the hold-out sets; every other subject stays in the training data of all folds
fold_subjects = [i for i in gbc.Subject.unique() if gbc.loc[gbc.Subject == i].shape[0] > 3]

random.seed(5)  # fix the generation for repeatability
random.shuffle(fold_subjects)

# The unary (one-class) classifiers are trained only on walking activities.
# This mask does not depend on the fold, so build it once up front.
walking_activities = [a for a in data.Activity.unique() if 'walking' in a]
walking_mask = data.Activity.isin(walking_activities).values

trn_m, utrn_m, val_m, tst_m = [], [], [], []
n_lo = 4                     # subjects left out of training per fold
n_val = n_lo - (n_lo // 2)   # the first n_val held-out subjects validate, the rest test
for i in range(0, len(fold_subjects), n_lo):
    # Slicing (instead of indexing i+j) keeps the final fold valid when the
    # number of subjects is not a multiple of n_lo: it simply holds out fewer.
    held_out = fold_subjects[i:i + n_lo]

    in_training = ~data.Subject.isin(held_out).values
    trn_m.append(in_training)
    # one-class training set: training subjects, walking activities only
    utrn_m.append(in_training & walking_mask)
    val_m.append(data.Subject.isin(held_out[:n_val]).values)
    tst_m.append(data.Subject.isin(held_out[n_val:]).values)

# (train, validation) boolean-mask pairs handed to cross_val_score
cv = tuple(zip(trn_m, val_m))
ucv = tuple(zip(utrn_m, val_m))

In [37]:
def uf1_scorer(estimator, X, y):
    """F1 scorer for one-class estimators.

    Predictions equal to -1 are mapped to 0 before scoring; all other
    predicted values are passed through unchanged.
    (Presumably `y` holds 0/1 labels with 1 = walking — verify against the data.)

    Parameters
    ----------
    estimator : fitted estimator exposing ``predict``
    X : samples to predict on
    y : ground-truth labels for ``X``

    Returns
    -------
    The value of ``f1_score(y, remapped_predictions)``.
    """
    raw_pred = estimator.predict(X)
    remapped = np.where(raw_pred == -1, 0, raw_pred)

    return f1_score(y, remapped)

# Overground Walking Classification Summary

This is a summary of the work-process for classifying overground gait from a lumbar accelerometer

## 0. Activity/Class balancing

Window length: 3s
Overlap: variable
Sampling Frequency: 50Hz (downsampled where necessary)

Loading data with a constant window spacing for all activities didn't result in a good distribution of activities, especially for those which need to be captured for the classifier to work well, such as running or stair ascent/descent.

In order to provide a better balance of classes, the spacing or overlap between windows was adjusted per activity to the following:

| Activity             | Overlap |
|----------------------|---------|
| jumping-rope         | 0.15    |
| stairs-descending    | 0.10    |
| stairs-ascending     | 0.10    |
| jumping              | 0.15    |
| lying                | 0.15    |
| elevator-ascending   | 0.15    |
| elevator-descending  | 0.15    |
| running              | 0.075   |
| sweeping             | 0.15    |
| standing             | 225     |
| running-treadmill    | 0.10    |
| cycling-50W          | 0.12    |
| cycling-100W         | 0.12    |
| walking-left         | 0.20    |
| walking-right        | 0.20    |
| walking-impaired     | 0.20    |
| walking              | 0.25    |
| sitting              | 400     |
| default              | 0.50    |

where a float indicates the fractional overlap between adjacent windows (i.e. 0.5 would be 50% overlap, or 75 samples between window starts), and an integer indicates the number of samples between window starts (so 400 would skip samples 151-400 between consecutive windows).

This results in the following distribution of positive/negative samples (stair ascending/descending in the negative class for now):

| Category    | Samples |
|-------------|---------|
| Total       | 105,029 |
| Walking     | 32,781  |
| Non-walking | 72,248  |
|-------------|---------|
| % walking   | 31.21%  |

and the following specific activity breakdown:

| Activity                 | Samples / Total   | Fraction  |
|--------------------------|-------------------|-----------|
| sit-to-stand             |     30 / 105,029  |  0.00     |
| standing-assisted        |    401 / 105,029  |  0.00     |
| vacuuming                |    736 / 105,029  |  0.01     |
| lying                    |    739 / 105,029  |  0.01     |
| jumping-rope             |  1,373 / 105,029  |  0.01     |
| washing-dishes           |  1,538 / 105,029  |  0.01     |
| jumping                  |  1,869 / 105,029  |  0.02     |
| sleeping                 |  2,245 / 105,029  |  0.02     |
| elevator-descending      |  3,070 / 105,029  |  0.03     |
| elevator-ascending       |  3,119 / 105,029  |  0.03     |
| walking-left             |  3,774 / 105,029  |  0.04     |
| standing                 |  3,875 / 105,029  |  0.04     |
| walking-right            |  4,023 / 105,029  |  0.04     |
| sweeping                 |  4,090 / 105,029  |  0.04     |
| cycling-50W              |  6,251 / 105,029  |  0.06     |
| cycling-100W             |  6,266 / 105,029  |  0.06     |
| running                  |  6,764 / 105,029  |  0.06     |
| sitting                  |  6,920 / 105,029  |  0.07     |
| stairs-descending        |  7,336 / 105,029  |  0.07     |
| running-treadmill        |  7,415 / 105,029  |  0.07     |
| stairs-ascending         |  8,211 / 105,029  |  0.08     |
| walking-impaired         |  8,241 / 105,029  |  0.08     |
| walking                  | 16,743 / 105,029  |  0.16     |

Features were computed on the magnitude of the acceleration, given by:
$$a_{mag} = \sqrt{a_x^2+a_y^2+a_z^2}$$

The following features were computed:

| Feature                      | Parameters | 
|------------------------------|------------|
| Mean                         |                                         |
| MeanCrossRate                |                                         |
| StdDev                       |                                         |
| Skewness                     |                                         |
| Kurtosis                     |                                         |
| Range                        |                                         |
| IQR                          |                                         |
| RMS                          |                                         |
| Autocorrelation              | lag=1, normalize=True                   |
| LinearSlope                  |                                         |
| SignalEntropy                |                                         |
| SampleEntropy                | m=4, r=1.0                              |
| PermutationEntropy           | order=3, delay=1, normalize=True        |
| ComplexityInvariantDistance  | normalize=True                          |
| RangeCountPercentage         | range_min=0, range_max=1.0              |
| RatioBeyondRSigma            | r=2.0                                   |
| JerkMetric                   | normalize=True                          |
| DimensionlessJerk            | log=True, signal_type='acceleration'    |
| SPARC                        |                                         |
| DominantFrequency            | low_cutoff=0.25, high_cutoff=5.0        |
| DominantFrequencyValue       | low_cutoff=0.25, high_cutoff=5.0        |
| PowerSpectralSum             | low_cutoff=0.25, high_cutoff=5.0        |
| SpectralFlatness             | low_cutoff=0.25, high_cutoff=5.0        |
| SpectralEntropy              | low_cutoff=0.25, high_cutoff=5.0        |
| DetailPower                  | wavelet='coif4', freq_band=[1.0, 3.0]   |
| DetailPowerRatio             | wavelet='coif4', freq_band=[1.0, 3.0]   |

<div class="alert alert-block alert-warning">
<b>TODO:</b> Explore effect of changing parameters. Could use PPScore to see effect on classification
</div>

## 1. Model Selection

Initial testing on binary classifiers SVM/RandomForest/XGBoost and one-class classifiers IsolationForest/OneClassSVM

Initial testing was done with stair ascent/descent as the __*negative*__ class

Overall, the binary classifiers achieved higher scores than the one-class classifiers. Initially dropping some features showed little to no change in performance; since the features were dropped based on their performance in the classifiers, this isn't surprising.

### 1a: Random Forest

In [32]:
# Random forest baseline, scored with F1 on the subject-wise folds.
# random_state is pinned so the reported score is repeatable between runs
# (the folds themselves are already seeded).
RF = RandomForestClassifier(n_estimators=20, random_state=5)
cv_res = cross_val_score(
    RF,
    feats,
    labels,
    scoring=make_scorer(f1_score),
    cv=cv,       # (train, validation) masks built per held-out subject group
    n_jobs=-1    # evaluate the folds in parallel on all cores
)
print(f'Average F1: {np.mean(cv_res)*100:.2f}')

Average F1: 81.32


### 1b: SVM
SVM takes a while to run

Average F1: ~58.84

In [21]:
# RBF-kernel SVM. Fitting is slow, so the cross-validation is opt-in.
# NOTE(review): feats are passed to the SVM without scaling in this cell — confirm
# that is intended, since RBF kernels are sensitive to feature scale.
svm = SVC(kernel='rbf', C=1.0)
run = False  # set to True to re-run the evaluation
if run:
    cv_res = cross_val_score(
        svm, feats, labels, cv=cv, scoring=make_scorer(f1_score), n_jobs=-1
    )
    print(f'Average F1: {100 * cv_res.mean():.2f}')

Average F1: 58.84


### 1c. XGBoost
In the end, XGBoost showed similar performance to the random forest, but has the benefit of faster run times and better persistence/saving.

In [33]:
# Gradient-boosted trees with the library's default hyper-parameters,
# evaluated with F1 on the same subject-wise folds as the other binary models.
clf = xgb.XGBClassifier()
cv_res = cross_val_score(
    clf, feats, labels, cv=cv, scoring=make_scorer(f1_score), n_jobs=-1
)
print(f'Average F1: {100 * cv_res.mean():.2f}')

Average F1: 84.01


### 1d: Isolation Forest

In [38]:
# One-class baseline: the isolation forest is fitted on the walking-only
# training masks (ucv) and scored on the full validation subjects.
# random_state is pinned so the reported score is repeatable between runs.
IF = IsolationForest(n_estimators=20, random_state=5)
cv_res = cross_val_score(
    IF,
    feats,
    labels,
    scoring=uf1_scorer,  # maps the -1 predictions to 0 before computing F1
    cv=ucv,
    n_jobs=-1
)
print(f'Average F1: {np.mean(cv_res)*100:.2f}')

Average F1: 61.86


### 1e: One-class SVM
Also takes a while

Average F1: ~48.52

In [39]:
# One-class SVM fitted on the walking-only training masks (ucv).
# Fitting is slow, so the cross-validation is opt-in.
usvc = OneClassSVM(kernel='rbf')
run = False  # set to True to re-run the evaluation
if run:
    cv_res = cross_val_score(
        usvc, feats, labels, cv=ucv, scoring=uf1_scorer, n_jobs=-1
    )
    print(f'Average F1: {100 * cv_res.mean():.2f}')

Average F1: 48.52


Given the large number of activity classes, it isn't surprising that the binary classifiers end up working better: they see the non-walking activities during training, so they should be able to better account for the feature space those activities occupy.