In [1]:
%matplotlib widget

In [2]:
import pandas as pd
import numpy as np
import random # shuffle the subjects
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.base import clone
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.svm import OneClassSVM, SVC
from sklearn.covariance import EllipticEnvelope

from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.decomposition import PCA

from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, roc_curve, auc

import xgboost as xgb

In [3]:
def LOSO_runner(data, model, train_masks, validate_masks, test_masks, test=False):
    """Fit and score a fresh clone of ``model`` on every leave-subjects-out split.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'Subject', 'Activity' and 'Label' columns; every other
        column is used as a feature.
    model : estimator
        Unfitted estimator; it is cloned for each split and must provide
        ``fit(X, y)`` and ``predict(X)``.
    train_masks, validate_masks, test_masks : iterable of boolean row masks
        One mask per split, used to select rows of ``data``.
    test : bool, default False
        Score on ``test_masks`` when True, otherwise on ``validate_masks``.

    Returns
    -------
    tuple
        ``(models, feature_columns, accuracy, precision, recall, f1)`` where
        ``models`` and the four metric lists hold one entry per split.

    The mean and standard deviation of each metric are printed.
    """
    # separate the features out
    feats = data.drop(['Subject', 'Activity', 'Label'], axis=1)

    # rows to score on: the test masks or the validation masks
    eval_masks = test_masks if test else validate_masks

    # per-split metrics, keyed by the name used in the printed summary
    scores = {'Accuracy': [], 'Precision': [], 'Recall': [], 'F1 Score': []}

    # fitted models, returned to the caller
    mdls = []

    for fit_mask, held_out_mask in zip(train_masks, eval_masks):
        fold_model = clone(model)
        fold_model.fit(feats.loc[fit_mask], data.Label[fit_mask])

        y_true = data.Label[held_out_mask]
        y_pred = fold_model.predict(feats.loc[held_out_mask])

        scores['Accuracy'].append(accuracy_score(y_true, y_pred, normalize=True))
        scores['Precision'].append(precision_score(y_true, y_pred))
        scores['Recall'].append(recall_score(y_true, y_pred))
        scores['F1 Score'].append(f1_score(y_true, y_pred))

        mdls.append(fold_model)

    for metric_name, values in scores.items():
        print(f'Average {metric_name}: {np.mean(values):.2f} ({np.std(values):.2f})')

    return (mdls, feats.columns, scores['Accuracy'], scores['Precision'],
            scores['Recall'], scores['F1 Score'])

In [4]:
def xgb_LOSO_runner(data, train_masks, validate_masks, test_masks, test=False,
                    params=None, num_boost_round=10, threshold=0.5):
    """Train and score one XGBoost booster per leave-subjects-out split.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'Subject', 'Activity' and 'Label' columns; every other
        column is used as a feature.
    train_masks, validate_masks, test_masks : iterable of boolean row masks
        One mask per split. The validation rows are always passed to
        ``xgb.train`` as an evaluation set.
    test : bool, default False
        Score on the test rows when True, otherwise on the validation rows.
    params : dict, optional
        Booster parameters handed to ``xgb.train``. Defaults to
        ``{'max_depth': 8, 'eval_metric': ['rmse', 'auc']}``.
    num_boost_round : int, default 10
        Number of boosting rounds.
    threshold : float, default 0.5
        Raw predictions strictly greater than this are counted as positive.

    Returns
    -------
    tuple
        ``(boosters, feature_columns, accuracy, precision, recall, f1)`` with
        one booster and one value per metric for each split.

    The mean and standard deviation of each metric are printed.
    """
    feats = data.drop(['Subject', 'Activity', 'Label'], axis=1)
    labels = data.Label

    if params is None:
        # max_depth=8 is deeper than XGBoost's documented default of 6.
        # NOTE(review): no 'objective' is given, so XGBoost's default objective
        # is used and its raw output is thresholded below. For a 0/1 label an
        # explicit 'binary:logistic' objective may be what is intended - confirm.
        params = {
            'max_depth': 8,
            'eval_metric': ['rmse', 'auc']
        }

    # metrics
    accu = []
    prec = []
    rec = []
    f1 = []

    # return the models
    mdls = []

    # iterate over groups of N_lo subjects to leave out
    for train_mask, val_mask, test_mask in zip(train_masks, validate_masks, test_masks):
        dtrain = xgb.DMatrix(feats.loc[train_mask], label=labels.loc[train_mask])
        dval = xgb.DMatrix(feats.loc[val_mask], label=labels.loc[val_mask])
        eval_list = [(dval, 'eval'), (dtrain, 'train')]

        # keyword arguments keep the call valid where only params/dtrain may be positional
        bst = xgb.train(params, dtrain, num_boost_round=num_boost_round,
                        evals=eval_list, verbose_eval=False)

        # pick the rows to score on; the test matrix is only built when needed
        if test:
            dpred = xgb.DMatrix(feats.loc[test_mask], label=labels.loc[test_mask])
            y_true = labels.loc[test_mask]
        else:
            dpred = dval
            y_true = labels.loc[val_mask]
        y_pred = bst.predict(dpred) > threshold

        accu.append(accuracy_score(y_true, y_pred, normalize=True))
        prec.append(precision_score(y_true, y_pred))
        rec.append(recall_score(y_true, y_pred))
        f1.append(f1_score(y_true, y_pred))

        mdls.append(bst)

    print(f'Average Accuracy: {np.mean(accu):.2f} ({np.std(accu):.2f})')
    print(f'Average Precision: {np.mean(prec):.2f} ({np.std(prec):.2f})')
    print(f'Average Recall: {np.mean(rec):.2f} ({np.std(rec):.2f})')
    print(f'Average F1 Score: {np.mean(f1):.2f} ({np.std(f1):.2f})')

    return mdls, feats.columns, accu, prec, rec, f1

In [16]:
def unary_LOSO_runner(data, model, train_masks, validate_masks, test_masks, test=False):
    """Fit a one-class model per leave-subjects-out split and score it as a binary classifier.

    Each clone of ``model`` is fitted, without labels, on the training rows
    whose 'Label' equals 1. Predictions of -1 are mapped to 0 before scoring
    against the true labels.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'Subject', 'Activity' and 'Label' columns; every other
        column is used as a feature.
    model : estimator
        Unfitted estimator; it is cloned for each split and must provide
        ``fit(X)`` and ``predict(X)``.
    train_masks, validate_masks, test_masks : iterable of boolean row masks
        One mask per split.
    test : bool, default False
        Score on ``test_masks`` when True, otherwise on ``validate_masks``.

    Returns
    -------
    tuple
        ``(models, feature_columns, accuracy, precision, recall, f1)`` with
        one model and one value per metric for each split.

    The mean and standard deviation of each metric are printed.
    """
    labels = data.Label
    feats = data.drop(['Subject', 'Activity', 'Label'], axis=1)

    # rows to score on: the test masks or the validation masks
    eval_masks = test_masks if test else validate_masks

    # fitted models and per-split metrics
    mdls = []
    accu, prec, rec, f1 = [], [], [], []

    for train_mask, held_out_mask in zip(train_masks, eval_masks):
        fold_model = clone(model)

        # train only on the rows of the training subjects labelled 1
        fit_rows = train_mask & (labels == 1)
        fold_model.fit(feats.loc[fit_rows])

        y_true = labels[held_out_mask]
        y_pred = fold_model.predict(feats.loc[held_out_mask])
        y_pred[y_pred == -1] = 0  # map -1 predictions onto label 0

        accu.append(accuracy_score(y_true, y_pred, normalize=True))
        prec.append(precision_score(y_true, y_pred))
        rec.append(recall_score(y_true, y_pred))
        f1.append(f1_score(y_true, y_pred))

        mdls.append(fold_model)

    summary = (('Accuracy', accu), ('Precision', prec), ('Recall', rec), ('F1 Score', f1))
    for metric_name, values in summary:
        print(f'Average {metric_name}: {np.mean(values):.2f} ({np.std(values):.2f})')

    return mdls, feats.columns, accu, prec, rec, f1

In [5]:
def plot_RF_feature_importances(models, features):
    """Violin-plot the spread of each feature's importance across fitted models.

    Parameters
    ----------
    models : sequence
        Fitted models exposing ``feature_importances_``, one per split.
    features : array-like
        Feature names, in the same order as each model's
        ``feature_importances_``.
    """
    # flatten the per-model importances into one long list, model after model
    importances = []
    for fold_model in models:
        importances.extend(fold_model.feature_importances_)

    # repeat the feature names once per model so they line up with the values
    ft_impt = pd.DataFrame({
        'Feature': np.tile(features, len(models)),
        'Importance': importances,
    })

    f, ax = plt.subplots(figsize=(10, 6))
    sns.violinplot(x='Feature', y='Importance', data=ft_impt, ax=ax, width=1, scale='width', bw=0.25)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
    f.tight_layout()

In [6]:
def plot_xgb_feature_gains(models):
    """Box-plot the 'gain' importance of each feature across boosters.

    Parameters
    ----------
    models : sequence
        Objects exposing ``get_score(importance_type='gain')`` that returns a
        ``{feature_name: gain}`` mapping, one per split (e.g. the boosters
        returned by ``xgb_LOSO_runner``).
    """
    # One (feature, split, gain) record per feature reported by each booster.
    # The split index is attached per record because boosters can report
    # different numbers of features (get_score leaves out features that were
    # never used in a split); assuming every booster matches the first one
    # makes the column lengths disagree.
    records = [
        (feature, split, gain)
        for split, booster in enumerate(models)
        for feature, gain in booster.get_score(importance_type='gain').items()
    ]
    df = pd.DataFrame(records, columns=['Feature', 'Split', 'Gain'])

    f, ax = plt.subplots(figsize=(10, 8))
    # draw on the axes created above rather than on whichever axes is current
    sns.boxplot(x='Feature', y='Gain', data=df, ax=ax)

    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
    f.tight_layout()

In [7]:
# Load the feature table stored under the 'no_preprocessing' key. The runners
# below expect it to contain 'Subject', 'Activity' and 'Label' columns.
data = pd.read_hdf('../feature_exploration/features.h5', key='no_preprocessing')

In [8]:
# Select the subjects used for the leave-subjects-out splits. Grouping by
# (Subject, Activity) gives one row per activity a subject performed, so a
# subject is kept only if it has more than 3 distinct activities.
# NOTE(review): the stated intent is "more than just walking" - confirm that a
# threshold of > 3 activities is what is wanted.
gbc = data.groupby(['Subject', 'Activity'], as_index=False).count()
loso_subjects = [i for i in gbc.Subject.unique() if gbc.loc[gbc.Subject == i].shape[0] > 3]

random.seed(5)  # fixed seed so the shuffled subject order is the same on every run
random.shuffle(loso_subjects)

# one boolean row mask per split, aligned with the rows of `data`
training_masks = []
validation_masks = []
testing_masks = []

# Walk the shuffled subjects in consecutive groups of three. For each group:
#   - training mask:   every row that belongs to none of the three subjects
#   - validation mask: rows of the first two subjects of the group
#   - testing mask:    rows of the third subject of the group
# NOTE(review): indexing loso_subjects[i+j] assumes len(loso_subjects) is a
# multiple of 3; otherwise the last group raises IndexError - confirm.
for i in range(0, len(loso_subjects), 3):
    tr_m = np.ones(data.shape[0], dtype='bool')
    v_m = np.zeros(data.shape[0], dtype='bool')
    
    for j in range(3):
        tr_m &= data.Subject != loso_subjects[i+j]
    for j in range(2):
        v_m |= data.Subject == loso_subjects[i+j]
    te_m = data.Subject == loso_subjects[i+2]
    
    training_masks.append(tr_m)
    validation_masks.append(v_m)
    testing_masks.append(te_m)

# bundled in the positional order the *_LOSO_runner functions expect
masks = (training_masks, validation_masks, testing_masks)

### Domain expert feature selection

In [21]:
# Feature table with the listed feature columns removed
# (domain-expert feature selection).
de_data = data.drop(
    columns=[
        'Mean',
        'Skewness',
        'Kurtosis',
        'RMS',
        'LinearSlope',
        'ComplexityInvariantDistance',
        'RangeCountPercentage',
        'RatioBeyondRSigma',
        'JerkMetric',
        'DominantFrequencyValue',
        'DetailPower',
        'SignalEntropy',
        'SpectralFlatness',
    ]
)

### Scale features

In [33]:
# Scale only the feature columns, selected by name exactly as the LOSO runners
# do (everything except Subject / Activity / Label), so the result does not
# depend on the column order of `data`.
# NOTE(review): both scalers are fitted on all rows, including the subjects
# that are later held out for validation/testing - confirm this is acceptable.
_feature_cols = data.columns.drop(['Subject', 'Activity', 'Label'])

rs_data = data.copy()
rs_data[_feature_cols] = RobustScaler().fit_transform(data[_feature_cols])
ss_data = data.copy()
ss_data[_feature_cols] = StandardScaler().fit_transform(data[_feature_cols])

### PCA

In [61]:
# Keep enough principal components to explain 95% of the variance of the
# robust-scaled features.
# NOTE(review): the PCA is fitted on all rows, including held-out subjects -
# confirm this is acceptable.
pca = PCA(n_components=0.95)

# First three columns of `data` (presumably Subject / Activity / Label, which
# the runners drop by name) followed by the component scores in integer-named
# columns. The scores are given data's own index so rows line up one-to-one
# even if that index is not 0..n-1; pairing against a default RangeIndex would
# otherwise silently drop or mismatch rows.
pca_data = pd.concat(
    [
        data.iloc[:, :3],
        pd.DataFrame(pca.fit_transform(rs_data.iloc[:, 3:]), index=data.index),
    ],
    axis=1,
)

# Binary Classifiers

## Random Forest

In [62]:
# Seed the forest so the per-split scores are reproducible from run to run
# (same seed value as the subject shuffle). Every clone made by LOSO_runner
# inherits this random_state.
RF = RandomForestClassifier(n_estimators=20, random_state=5)

rf_mdls, fts, *rf_metrics = LOSO_runner(data, RF, *masks, test=False)
print('\nRobust Scaling')
rf_rs_mdls, rs_fts, *rf_rs_metrics = LOSO_runner(rs_data, RF, *masks, test=False)
print('\nStandard Scaling')
rf_ss_mdls, ss_fts, *rf_ss_metrics = LOSO_runner(ss_data, RF, *masks, test=False)
print('\nPCA (robust scaling)')
rf_pca_mdls, pca_fts, *rf_pca_metrics = LOSO_runner(pca_data, RF, *masks, test=False)

Average Accuracy: 0.92 (0.03)
Average Precision: 0.84 (0.09)
Average Recall: 0.81 (0.15)
Average F1 Score: 0.81 (0.10)

Robust Scaling
Average Accuracy: 0.92 (0.03)
Average Precision: 0.85 (0.09)
Average Recall: 0.81 (0.14)
Average F1 Score: 0.82 (0.09)

Standard Scaling
Average Accuracy: 0.92 (0.03)
Average Precision: 0.85 (0.09)
Average Recall: 0.81 (0.15)
Average F1 Score: 0.82 (0.09)

PCA (robust scaling)
Average Accuracy: 0.89 (0.04)
Average Precision: 0.76 (0.11)
Average Recall: 0.77 (0.14)
Average F1 Score: 0.76 (0.10)


In [36]:
# Pair each set of models with the feature index returned by the same run.
# `fts` is rebound by a later cell, so reusing it for all three plots would
# mislabel (or fail) if this cell is re-run afterwards.
plot_RF_feature_importances(rf_mdls, fts)
plot_RF_feature_importances(rf_rs_mdls, rs_fts)
plot_RF_feature_importances(rf_ss_mdls, ss_fts)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [12]:
# Earlier, more aggressive candidate drop list, kept for reference:
# data_subset = data.drop(['Skewness', 'Kurtosis', 'Autocorrelation', 'LinearSlope', 'SignalEntropy', 
#                          'ComplexityInvariantDistance', 'RangeCountPercentage', 'RatioBeyondRSigma',
#                          'SpectralFlatness', 'Mean', 'MeanCrossRate', 'DominantFrequencyValue', 'RMS', 'DetailPowerRatio'], axis=1)

# Feature table with the listed feature columns removed.
data_subset = data.drop(
    columns=[
        'LinearSlope',
        'RatioBeyondRSigma',
        'SpectralFlatness',
        'ComplexityInvariantDistance',
        'Kurtosis',
        'Autocorrelation',
        'DominantFrequencyValue',
        'SignalEntropy',
        'RangeCountPercentage',
    ]
)

In [13]:
# Re-run the random forest on the reduced feature set to see whether dropping
# the less important features changes the validation scores.
rfsub_mdls, sub_fts, *rfsub_metrics = LOSO_runner(data_subset, RF, *masks, test=False)

Average Accuracy: 0.92 (0.03)
Average Precision: 0.84 (0.09)
Average Recall: 0.81 (0.14)
Average F1 Score: 0.81 (0.09)


In [14]:
# Importance spread across splits for the forests trained on the reduced feature set.
plot_RF_feature_importances(rfsub_mdls, sub_fts)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [22]:
# Random forest validation scores on the domain-expert (DE) feature selection.
rfde_mdls, de_fts, *rfde_metrics = LOSO_runner(de_data, RF, *masks, test=False)

Average Accuracy: 0.91 (0.04)
Average Precision: 0.80 (0.10)
Average Recall: 0.80 (0.14)
Average F1 Score: 0.79 (0.09)


## SVM Classifier

In [18]:
# RBF-kernel support vector classifier evaluated on the reduced feature set.
# NOTE(review): data_subset is derived from the unscaled `data`; RBF SVMs are
# usually sensitive to feature scale - consider a scaled table. Also note that
# this rebinds `fts`, which an earlier cell set to the full feature list.
svc = SVC(kernel='rbf', C=1.0)

svc_mdls, fts, *svc_metrics = LOSO_runner(data_subset, svc, *masks, test=False)

Average Accuracy: 0.87 (0.05)
Average Precision: 0.82 (0.05)
Average Recall: 0.88 (0.15)
Average F1 Score: 0.84 (0.09)


## XGBoost

In [63]:
# XGBoost validation scores on the raw, robust-scaled, standard-scaled and
# PCA-transformed feature tables, using the same splits throughout.
print('No Scaling')
xgb_mdls, xgb_fts, xgb_accu, xgb_prec, xgb_rec, xgb_f1 = xgb_LOSO_runner(data, *masks, test=False)

print('\nRobust Scaling')
xgb_rs_mdls, xgb_rs_fts, *xgb_rs_metr = xgb_LOSO_runner(rs_data, *masks, test=False)

print('\nStandard Scaling')
xgb_ss_mdls, xgb_ss_fts, *xgb_ss_metr = xgb_LOSO_runner(ss_data, *masks, test=False)

print('\nPCA (robust scaling)')
xgb_pca_mdls, xgb_pca_fts, *xgb_pca_metr = xgb_LOSO_runner(pca_data, *masks, test=False)

No Scaling
Average Accuracy: 0.91 (0.04)
Average Precision: 0.81 (0.10)
Average Recall: 0.82 (0.15)
Average F1 Score: 0.80 (0.09)

Robust Scaling
Average Accuracy: 0.91 (0.04)
Average Precision: 0.81 (0.10)
Average Recall: 0.82 (0.15)
Average F1 Score: 0.80 (0.09)

Standard Scaling
Average Accuracy: 0.91 (0.04)
Average Precision: 0.81 (0.10)
Average Recall: 0.82 (0.15)
Average F1 Score: 0.80 (0.09)

PCA (robust scaling)
Average Accuracy: 0.88 (0.04)
Average Precision: 0.71 (0.13)
Average Recall: 0.81 (0.14)
Average F1 Score: 0.75 (0.10)


In [17]:
# Earlier, more aggressive candidate drop list, kept for reference:
# data_subset_2 = data.drop(['Mean', 'RatioBeyondRSigma', 'SpectralEntropy', 'SpectralFlatness', 'LinearSlope', 'RMS', 'Skewness', 
#                            'Kurtosis', 'Autocorrelation', 'DominantFrequencyValue', 'IQR', 
#                            'MeanCrossRate', 'ComplexityInvariantDistance', 'RangeCountPercentage', 'SignalEntropy', 
#                            'DimensionlessJerk', 'DetailPowerRatio', 'JerkMetric', 'SampleEntropy'], axis=1)

# Feature table with the listed feature columns removed (used for the XGBoost runs).
data_subset_2 = data.drop(
    columns=[
        'LinearSlope',
        'SPARC',
        'RatioBeyondRSigma',
        'Mean',
        'Kurtosis',
        'PowerSpectralSum',
        'IQR',
        'RangeCountPercentage',
        'DominantFrequencyValue',
        'ComplexityInvariantDistance',
    ]
)

In [18]:
# XGBoost validation scores on the second reduced feature set.
xgb_s_mdls, xgb_s_fts, xgb_s_accu, xgb_s_prec, xgb_s_rec, xgb_s_f1 = xgb_LOSO_runner(
    data_subset_2, *masks, test=False
)

Average Accuracy: 0.91 (0.04)
Average Precision: 0.79 (0.11)
Average Recall: 0.84 (0.14)
Average F1 Score: 0.80 (0.09)


In [19]:
# Gain importance across splits for the boosters trained on the second reduced feature set.
plot_xgb_feature_gains(xgb_s_mdls)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [23]:
# XGBoost validation scores on the domain-expert feature selection.
xgb_de_mdls, xgb_de_fts, xgb_de_accu, xgb_de_prec, xgb_de_rec, xgb_de_f1 = xgb_LOSO_runner(
    de_data, *masks, test=False
)

Average Accuracy: 0.90 (0.04)
Average Precision: 0.76 (0.12)
Average Recall: 0.83 (0.15)
Average F1 Score: 0.78 (0.10)


# One-class Classifiers

## Isolation Forest

In [17]:
# Seed the isolation forest so the per-split scores are reproducible from run
# to run (same seed value as the subject shuffle). Every clone made by
# unary_LOSO_runner inherits this random_state.
IF = IsolationForest(n_estimators=20, random_state=5)

if_mdls, if_fts, *if_metr = unary_LOSO_runner(data, IF, *masks, test=False)

Average Accuracy: 0.71 (0.11)
Average Precision: 0.44 (0.20)
Average Recall: 0.84 (0.12)
Average F1 Score: 0.56 (0.18)


## One-class SVM

In [20]:
# One-class SVM with an RBF kernel, evaluated on the validation masks with the
# one-class runner; the returned feature index is discarded.
usvc = OneClassSVM(kernel='rbf')

usvc_mdls, _, *usvc_metr = unary_LOSO_runner(data, usvc, *masks, test=False)

Average Accuracy: 0.80 (0.05)
Average Precision: 0.54 (0.16)
Average Recall: 0.44 (0.19)
Average F1 Score: 0.46 (0.16)
