In [5]:
%matplotlib widget

In [2]:
import pandas as pd
import numpy as np
import random
import seaborn as sns
import matplotlib.pyplot as plt

import ast

from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score

In [3]:
def LOSO_runner(data, model, train_masks, validate_masks, test_masks, test=False):
    """
    Fit a fresh clone of `model` on every fold and score it on the held-out rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing the feature columns plus 'Subject', 'Activity' and
        'Label'. Every column other than those three is used as a feature.
    model : estimator
        Template estimator. It is copied with `clone` for each fold and has to
        provide `fit`, `predict` and `predict_proba`.
    train_masks, validate_masks, test_masks : sequence
        Per-fold row selectors (boolean masks in this notebook). Each one is
        passed to `.loc` on the features and used to index `data.Label`.
    test : bool, optional
        When truthy the folds are evaluated on `test_masks`, otherwise on
        `validate_masks` (default).

    Returns
    -------
    tuple
        (fitted models, feature column index, per-fold `predict_proba`
        outputs, per-fold true labels, accuracies, precisions, recalls,
        F1 scores) -- all but the column index are lists with one entry per fold.

    Notes
    -----
    Prints the mean and standard deviation of each metric across folds.
    """
    # everything that is not bookkeeping or the target is a feature
    features = data.drop(columns=['Subject', 'Activity', 'Label'])
    labels = data.Label

    # rows each fold is evaluated on
    eval_masks = test_masks if test else validate_masks

    # per-fold results
    fitted_models = []
    fold_probas, fold_truths = [], []
    accu, prec, rec, f1 = [], [], [], []

    for fit_rows, eval_rows in zip(train_masks, eval_masks):
        # never touch the template: each fold gets its own unfitted copy
        fold_model = clone(model)
        fold_model.fit(features.loc[fit_rows], labels[fit_rows])

        held_out = features.loc[eval_rows]
        proba = fold_model.predict_proba(held_out)
        y_pred = fold_model.predict(held_out)
        y_true = labels[eval_rows]

        fold_probas.append(proba)
        fold_truths.append(y_true)

        accu.append(accuracy_score(y_true, y_pred, normalize=True))
        prec.append(precision_score(y_true, y_pred))
        rec.append(recall_score(y_true, y_pred))
        f1.append(f1_score(y_true, y_pred))

        fitted_models.append(fold_model)

    # summary across folds: mean (standard deviation)
    print(f'Average Accuracy: {np.mean(accu):.2f} ({np.std(accu):.2f})')
    print(f'Average Precision: {np.mean(prec):.2f} ({np.std(prec):.2f})')
    print(f'Average Recall: {np.mean(rec):.2f} ({np.std(rec):.2f})')
    print(f'Average F1 Score: {np.mean(f1):.2f} ({np.std(f1):.2f})')

    return fitted_models, features.columns, fold_probas, fold_truths, accu, prec, rec, f1

In [4]:
# Load the feature table stored under the 'incl_stairs' key of the HDF5 file.
# The cells below rely on its 'Subject', 'Activity' and 'Label' columns.
data = pd.read_hdf('../feature_exploration/features.h5', key='incl_stairs')

In [6]:
# Get the subjects for which LOSO actually makes sense: those with multiple
# activities (i.e. more than just walking) -- concretely, subjects that have
# more than 3 (Subject, Activity) groups in the data.
gbc = data.groupby(['Subject', 'Activity'], as_index=False).count()
loso_subjects = [i for i in gbc.Subject.unique() if gbc.loc[gbc.Subject == i].shape[0] > 3]

random.seed(5)  # fix the generation so that it's the same every time
random.shuffle(loso_subjects)

# Subjects are held out in groups of `fold_size`: the first `n_validate` of
# each group form the validation set, the remaining ones the test set, and
# every row of any other subject is training data for that fold.
fold_size = 4
n_validate = 2

# Only complete groups become folds. Indexing past the end of the list used to
# raise an IndexError when the subject count was not a multiple of the group
# size; leftover subjects are now reported and simply never held out.
n_folds = len(loso_subjects) // fold_size
leftover = loso_subjects[n_folds * fold_size:]
if leftover:
    print(f'{len(leftover)} subject(s) do not fill a complete fold and are never held out: {leftover}')

training_masks = []
validation_masks = []
testing_masks = []

for i in range(0, n_folds * fold_size, fold_size):
    held_out = loso_subjects[i:i + fold_size]

    # np.array(...) gives each mask its own writable boolean array over all rows
    tr_m = ~np.array(data.Subject.isin(held_out))
    v_m = np.array(data.Subject.isin(held_out[:n_validate]))
    te_m = np.array(data.Subject.isin(held_out[n_validate:]))

    training_masks.append(tr_m)
    validation_masks.append(v_m)
    testing_masks.append(te_m)

masks = (training_masks, validation_masks, testing_masks)

In [7]:
# Read the saved cross-validation results and rebuild the parameter dict of
# the top-ranked entry (smallest 'rank_test_score').
cvr = pd.read_csv('rfc_cv_results_topfeats.csv', index_col=0)

ranked = cvr.sort_values('rank_test_score')
best_params_text = ranked['params'].iloc[0]

# the parameters are parsed back from their literal text form
params = ast.literal_eval(best_params_text)

In [8]:
# Random forest configured with the parameters parsed from the CV results.
# LOSO_runner fits clones of this template rather than the instance itself.
model = RandomForestClassifier(**params)

In [10]:
# Validation run (test=False): keep only the per-fold class probabilities and
# the matching true labels; the remaining return values are discarded.
_, _, preds, truths, *_ = LOSO_runner(
    data=data,
    model=model,
    train_masks=training_masks,
    validate_masks=validation_masks,
    test_masks=testing_masks,
    test=False,
)

Average Accuracy: 0.95 (0.05)
Average Precision: 0.94 (0.08)
Average Recall: 0.95 (0.09)
Average F1 Score: 0.94 (0.06)


In [11]:
# Left panel: per-fold ROC curves. Right panel: Youden's index as a function
# of the decision threshold, used to pick a per-fold optimal threshold.
f, (ax, ax1) = plt.subplots(ncols=2, figsize=(10, 6), sharey=True)

opt_trsh = []

for i, (p_, t_) in enumerate(zip(preds, truths), start=1):
    fpr, tpr, thrsh = roc_curve(t_, p_[:, 1])

    # compute optimal threshold based on Youden's index (J = TPR - FPR)
    dist = tpr - fpr

    # roc_curve prepends an artificial first threshold ("nothing predicted
    # positive") that lies outside the probability range. Skip it so it can
    # neither be selected as the optimum nor distort the threshold panel.
    best = 1 + np.argmax(dist[1:])
    opt_trsh.append(thrsh[best])

    ax.plot(fpr, tpr, label=f'Fold{i:3d}: {auc(fpr, tpr):.2f}')
    ax1.plot(thrsh[1:], dist[1:], label=f'Fold{i:3d}: {opt_trsh[-1]:.2f}')

ax.legend(loc='best', title='AUC')
ax1.legend(loc=3, title='Optimal threshold')

# reference lines: chance-level ROC and the default 0.5 threshold
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax1.axvline(0.5, color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax1.set_xlim([0.0, 1.0])

ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curves')

# the y axis is shared, so label the right panel explicitly
ax1.set_xlabel('Decision Threshold')
ax1.set_ylabel("Youden's Index (TPR - FPR)")
ax1.set_title("Youden's Index vs. Threshold")

f.tight_layout()

# mean and median of the per-fold optimal thresholds
print(np.mean(opt_trsh), np.median(opt_trsh))

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

0.5241609248934246 0.5946301265057008


In [12]:
# Per-fold precision-recall curves on top of iso-F1 reference curves.
f, ax = plt.subplots(figsize=(10, 6))

for i, (p_, t_) in enumerate(zip(preds, truths), start=1):
    # the thresholds returned as third element are not needed here
    precision, recall = precision_recall_curve(t_, p_[:, 1])[:2]
    ap = average_precision_score(t_, p_[:, 1])

    ax.plot(recall, precision, label=f'Fold{i:3d}: {ap:.2f}')

# iso-F1 curves: precision = f1 * recall / (2 * recall - f1)
f_scores = np.linspace(0.2, 0.8, num=4)
x = np.linspace(0.01, 1)  # recall grid (50 points by default), shared by all curves
for f_score in f_scores:
    y = f_score * x / (2 * x - f_score)
    # only the branch with non-negative precision is meaningful
    ax.plot(x[y >= 0], y[y >= 0], color='gray', alpha=0.2)
    # y[45] is the curve value at recall ~0.92, so the label sits just above
    # the right-hand end of its curve
    ax.annotate(f'f1={f_score:0.1f}', xy=(0.9, y[45] + 0.02))

ax.legend(loc='best', title='Average precision')

ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])

ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision-Recall Curves')

f.tight_layout()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [17]:
# Collect every validation sample whose column-1 probability exceeds the
# threshold although its true label is 0, together with its activity, the
# fold it came from and that probability.
# NOTE(review): 0.6 is presumably rounded from the median Youden-optimal
# threshold (~0.59) printed by the ROC cell -- confirm.
fp_threshold = 0.6

fp_acts = []
fp_fold = []
fp_certainty = []

for fold_no, (mask, pred, truth) in enumerate(zip(validation_masks, preds, truths), start=1):
    pos_proba = pred[:, 1]

    # within-fold flags, aligned with the rows selected by `mask`
    is_fp = np.asarray((pos_proba > fp_threshold) & (truth == 0))

    # scatter the flags back into a mask spanning all rows of `data`
    false_pos = np.zeros_like(mask)
    false_pos[mask] = is_fp

    fp_acts.extend(data.Activity.loc[false_pos].tolist())
    fp_fold.extend([fold_no] * false_pos.sum())
    fp_certainty.extend(pos_proba[is_fp])

fp = pd.DataFrame(data={
    'Activity': fp_acts,
    'Fold': fp_fold,
    'Certainty': fp_certainty
})

In [22]:
# Number of collected false positives per activity, sorted ascending.
# count() reports non-null entries per column, so 'Fold' and 'Certainty'
# both carry the per-activity row count.
fp.groupby('Activity').count().sort_values('Fold')

Unnamed: 0_level_0,Fold,Certainty
Activity,Unnamed: 1_level_1,Unnamed: 2_level_1
elevator-descending,1,1
sitting,2,2
standing-assisted,2,2
washing-dishes,2,2
running,3,3
vacuuming,3,3
standing,9,9
cycling-100W,203,203
sweeping,210,210
cycling-50W,276,276


In [19]:
# Spread of the predicted probability (column 1 of `predict_proba`) among the
# false positives, grouped by activity and coloured by fold.
f, ax = plt.subplots(figsize=(9, 5))

sns.boxplot(data=fp, x='Activity', y='Certainty', hue='Fold', ax=ax)

# slant the activity names so they do not overlap
tick_labels = ax.get_xticklabels()
ax.set_xticklabels(tick_labels, rotation=45, ha='right')

ax.set(title='False Positive Certainty Distribution')

f.tight_layout()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …