In [1]:
%matplotlib widget

In [2]:
import pandas as pd
import numpy as np
import random # shuffle the subjects
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.base import clone
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.svm import OneClassSVM, SVC
from sklearn.covariance import EllipticEnvelope

from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

In [3]:
def LOSO_runner(data, model, train_masks, validate_masks, test_masks, test=False):
    """Fit a fresh clone of `model` on every fold and report hold-out metrics.

    Parameters
    ----------
    data : DataFrame with 'Subject', 'Activity' and 'Label' columns; every
        other column is used as a feature.
    model : estimator accepted by `sklearn.base.clone` that exposes `fit`
        and `predict`. The instance passed in is never fitted itself.
    train_masks : iterable of row selectors (one per fold) used for fitting.
    validate_masks : iterable of row selectors scored when `test` is False.
    test_masks : iterable of row selectors scored when `test` is True.
    test : pick `test_masks` (True) or `validate_masks` (False) for scoring.

    Returns
    -------
    tuple
        (fitted per-fold models, feature column index, accuracy list,
        precision list, recall list, F1 list). The mean and standard
        deviation of each metric across folds are printed as a side effect.
    """
    # everything except the bookkeeping columns is a feature
    feats = data.drop(['Subject', 'Activity', 'Label'], axis=1)
    labels = data.Label

    # which set of masks the fitted models are scored against
    scoring_masks = test_masks if test else validate_masks

    accu, prec, rec, f1 = [], [], [], []
    mdls = []

    for fit_mask, score_mask in zip(train_masks, scoring_masks):
        # clone so that every fold starts from an unfitted estimator
        fold_model = clone(model)
        fold_model.fit(feats.loc[fit_mask], labels[fit_mask])

        predicted = fold_model.predict(feats.loc[score_mask])
        expected = labels[score_mask]

        accu.append(accuracy_score(expected, predicted, normalize=True))
        prec.append(precision_score(expected, predicted))
        rec.append(recall_score(expected, predicted))
        f1.append(f1_score(expected, predicted))

        mdls.append(fold_model)

    print(f'Average Accuracy: {np.mean(accu):.2f} ({np.std(accu):.2f})')
    print(f'Average Precision: {np.mean(prec):.2f} ({np.std(prec):.2f})')
    print(f'Average Recall: {np.mean(rec):.2f} ({np.std(rec):.2f})')
    print(f'Average F1 Score: {np.mean(f1):.2f} ({np.std(f1):.2f})')

    return mdls, feats.columns, accu, prec, rec, f1

In [4]:
def plot_RF_feature_importances(models, features):
    """Draw a violin plot of per-feature importances across fitted models.

    Parameters
    ----------
    models : sequence of fitted estimators exposing `feature_importances_`.
    features : feature names, assumed to be in the same order as each
        model's `feature_importances_` (they are tiled once per model).

    Returns nothing; a new 10x6 figure is created and drawn into.
    """
    # flatten model-by-model so the values line up with the tiled names below
    importances = [score for mdl in models for score in mdl.feature_importances_]

    # long format: one row per (model, feature) pair
    ft_impt = pd.DataFrame({
        'Feature': np.tile(features, len(models)),
        'Importance': importances,
    })

    f, ax = plt.subplots(figsize=(10, 6))
    sns.violinplot(
        x='Feature',
        y='Importance',
        data=ft_impt,
        ax=ax,
        width=1,
        scale='width',
        bw=0.25,
    )
    # slant the feature names so long labels do not overlap
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
    f.tight_layout()

In [5]:
# Load the feature table stored under the 'no_preprocessing' key of the HDF5 file.
data = pd.read_hdf(path_or_buf='../feature_exploration/features.h5', key='no_preprocessing')

In [6]:
# Get the subjects for which LOSO actually makes sense. `gbc` has one row per
# (Subject, Activity) group, so the number of rows belonging to a subject is
# the number of activity groups that subject has.
# NOTE(review): `> 3` keeps only subjects with at least four activity groups,
# which is stricter than "more than just walking" -- confirm this is intended.
gbc = data.groupby(['Subject', 'Activity'], as_index=False).count()
loso_subjects = [i for i in gbc.Subject.unique() if gbc.loc[gbc.Subject == i].shape[0] > 3]

random.seed(5)  # fix the generation so that it's the same every time
random.shuffle(loso_subjects)

training_masks = []
validation_masks = []
testing_masks = []

# Subjects are held out three at a time: all three are excluded from training,
# the first two form the validation set and the third is the test subject.
# Only complete groups are used: a trailing partial group would make
# loso_subjects[i + 2] raise IndexError when the subject count is not a
# multiple of 3. Leftover subjects stay in every training mask.
n_grouped = len(loso_subjects) - len(loso_subjects) % 3
if n_grouped != len(loso_subjects):
    print(f'{len(loso_subjects) - n_grouped} subject(s) do not fill a group of 3 and are never held out')

for i in range(0, n_grouped, 3):
    held_out = loso_subjects[i:i + 3]

    # .to_numpy() keeps the training/validation masks as plain boolean arrays
    tr_m = (~data.Subject.isin(held_out)).to_numpy()
    v_m = data.Subject.isin(held_out[:2]).to_numpy()
    te_m = data.Subject == held_out[2]

    training_masks.append(tr_m)
    validation_masks.append(v_m)
    testing_masks.append(te_m)


# Binary Classifiers

## Random Forest

In [15]:
# Seed the forest so the fold metrics are reproducible on a fresh kernel;
# clone() inside LOSO_runner copies this parameter to every fold's model.
RF = RandomForestClassifier(n_estimators=20, random_state=5)

rf_mdls, fts, *rf_metrics = LOSO_runner(data, RF, training_masks, validation_masks, testing_masks, test=False)

Average Accuracy: 0.93 (0.04)
Average Precision: 0.91 (0.05)
Average Recall: 0.91 (0.10)
Average F1 Score: 0.91 (0.07)


In [16]:
# Spread of each feature's importance across the per-fold forests.
plot_RF_feature_importances(models=rf_mdls, features=fts)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [21]:
# Check performance after dropping some of the less important features.
data_subset = data.drop(
    columns=[
        'Skewness',
        'Kurtosis',
        'Autocorrelation',
        'LinearSlope',
        'SignalEntropy',
        'ComplexityInvariantDistance',
        'RangeCountPercentage',
        'RatioBeyondRSigma',
        'SpectralFlatness',
        'Mean',
        'MeanCrossRate',
        'DominantFrequencyValue',
        'RMS',
        'DetailPowerRatio',
    ]
)
rfsub_mdls, sub_fts, *rfsub_metrics = LOSO_runner(
    data_subset,
    RF,
    train_masks=training_masks,
    validate_masks=validation_masks,
    test_masks=testing_masks,
    test=False,
)

Average Accuracy: 0.92 (0.04)
Average Precision: 0.89 (0.06)
Average Recall: 0.93 (0.08)
Average F1 Score: 0.91 (0.06)


In [22]:
# Importance spread for the forests trained on the reduced feature set.
plot_RF_feature_importances(models=rfsub_mdls, features=sub_fts)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## SVM Classifier

In [None]:
# RBF-kernel support vector classifier, scored on the validation masks.
svc = SVC(kernel='rbf', C=1.0)

svc_mdls, fts, *svc_metrics = LOSO_runner(
    data,
    svc,
    train_masks=training_masks,
    validate_masks=validation_masks,
    test_masks=testing_masks,
    test=False,
)

# One-class Classifiers

## Isolation Forest