In [3]:
import glob

import numpy as np
import pandas as pd
from scipy.stats import mode
from sklearn.ensemble import RandomForestClassifier as RFC, ExtraTreesClassifier as ETC
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score

from tqdm import tqdm, trange

# Baseline: Leave-One-Subject-Out Evaluation

In [10]:
def segmentation(df, overlap_rate, time_window):
    """Split one recording into fixed-length, optionally overlapping windows.

    Missing values are filled first (default interpolation, then forward
    fill, then 0 for anything still missing). A window of ``time_window``
    rows is then slid over the frame.

    Parameters
    ----------
    df : pandas.DataFrame
        One recording; rows are used in their existing order.
    overlap_rate : float
        Fraction of a window shared with the next one (0 = no overlap,
        0.5 = half a window).
    time_window : int
        Number of rows per window.

    Returns
    -------
    list of pandas.DataFrame
        Windows of exactly ``time_window`` rows, each re-indexed from 0.
        Empty when the recording is shorter than one window.

    Raises
    ------
    ValueError
        If the window step derived from ``overlap_rate`` and
        ``time_window`` is smaller than one row.
    """
    # Round instead of truncating: (1 - 0.9) * 10 evaluates to 0.999... in
    # floating point, which int() alone would turn into a zero step.
    step = int(round((1 - overlap_rate) * time_window))
    if step < 1:
        raise ValueError(
            f"overlap_rate={overlap_rate!r} with time_window={time_window!r} "
            "gives a window step smaller than one row"
        )

    filled = df.interpolate().ffill().fillna(0)

    # Positional slicing keeps the windows correct even when the frame does
    # not carry the default 0..n-1 index.
    return [
        filled.iloc[start:start + time_window].copy().reset_index(drop=True)
        for start in range(0, len(filled) - time_window + 1, step)
    ]

In [11]:
def get_features(x_data):
    """Summarise every column of a window with six statistics.

    For each column, in column order, the output holds: population
    standard deviation, mean, maximum, minimum, median and population
    variance.

    Parameters
    ----------
    x_data : pandas.DataFrame
        Window whose columns are all to be summarised.

    Returns
    -------
    list
        Flat list of ``6 * number_of_columns`` values.
    """
    features = []
    for name in x_data.columns.tolist():
        column = x_data[name]
        features.extend(
            (
                column.std(ddof=0),
                np.average(column),
                np.max(column),
                np.min(column),
                np.median(column),
                np.var(column),
            )
        )
    return features

In [48]:
def dataloader(overlap, window_size):
    """Read every training CSV and cut each one into windows.

    Parameters
    ----------
    overlap : float
        Overlap fraction forwarded to ``segmentation``.
    window_size : int
        Window length in rows, forwarded to ``segmentation``.

    Returns
    -------
    data_list : list of pandas.DataFrame
        All windows from all files, in file order.
    file_lengths : dict
        Maps subject 1, 2 and 3 to the list of window counts of that
        subject's files, in the same file order as ``data_list``.

    Raises
    ------
    KeyError
        If a file's subject value is not one of 1, 2 or 3.
    """
    print("loading the data...", end="\t")
    data_list = []
    file_lengths = {1: [], 2: [], 3: []}
    # glob returns paths in arbitrary, filesystem-dependent order; sorting
    # makes the window order (and the per-file counts) reproducible.
    for file in tqdm(sorted(glob.glob("../TrainData/*/*/*.csv"))):
        segmented_data = segmentation(pd.read_csv(file), overlap, window_size)
        if segmented_data:
            # Subject is read from the second-to-last column of the first
            # window's first row.
            # NOTE(review): presumably this is the "subject_id" column that
            # feature_extractor reads by name; confirm the CSV column order.
            person = segmented_data[0].iloc[0, -2]
            file_lengths[person].append(len(segmented_data))
        data_list.extend(segmented_data)
    return data_list, file_lengths

In [80]:
# Load all recordings as 2500-row windows with 50% overlap; the per-file
# window counts are not needed here.
data_list, _ = dataloader(overlap=0.5, window_size=2500)

loading the data...	

100%|██████████| 151/151 [00:10<00:00, 13.89it/s]


In [49]:
def feature_extractor(data_list):
    """Turn windows into per-subject feature vectors and labels.

    Parameters
    ----------
    data_list : list of pandas.DataFrame
        Windows containing "subject_id" and "activity" columns.

    Returns
    -------
    X : dict
        Maps subject 1, 2 and 3 to a list of feature vectors (one per
        window, from ``get_features`` on the remaining columns).
    y : dict
        Maps subject 1, 2 and 3 to the matching list of labels.
    """
    print("extracting the features...", end="  ")
    X = {1: [], 2: [], 3: []}
    y = {1: [], 2: [], 3: []}
    for idx in trange(0, len(data_list)):
        segment = data_list[idx]
        subject = segment.loc[0, "subject_id"]
        # Everything except the two bookkeeping columns is signal data.
        signals = segment.drop(columns=["subject_id", "activity"])
        X[subject].append(get_features(signals))
        # Label is the last column of the window's first row.
        y[subject].append(segment.iloc[0, -1])
    return X, y

In [78]:
def majority_voting(predictions, file_lengths):
    """Replace each file's window predictions with that file's majority label.

    ``predictions`` is read as consecutive runs, one run per entry of
    ``file_lengths``; every prediction in a run is replaced by the most
    frequent label of that run. Ties resolve to the smallest label.

    Parameters
    ----------
    predictions : array-like
        Per-window predicted labels, ordered file by file.
    file_lengths : iterable of int
        Number of windows belonging to each file, in the same order.

    Returns
    -------
    list
        One label per window covered by ``file_lengths``.

    Raises
    ------
    ValueError
        If ``predictions`` runs out before ``file_lengths`` is exhausted.
    """
    predictions = np.asarray(predictions)
    smoothed = []
    start = 0
    for length in file_lengths:
        file_pred = predictions[start:start + length]
        if len(file_pred) != length:
            raise ValueError(
                f"expected {length} predictions starting at position {start}, "
                f"found {len(file_pred)}"
            )
        if length:
            # np.unique returns sorted labels, so argmax picks the smallest
            # label among ties. Unlike scipy.stats.mode this does not depend
            # on the SciPy version and also works for string labels.
            labels, counts = np.unique(file_pred, return_counts=True)
            smoothed.extend([labels[np.argmax(counts)]] * length)
        start += length
    return smoothed

In [94]:
def LOOCV_train_evaluate(model, overlap_rate, window_size, voting=True, n_repeats=1):
    """Evaluate ``model`` by holding out each of the three subjects in turn.

    Data is loaded and windowed once. For every repeat, each of subjects
    1, 2 and 3 is used as the test set while the model is refit on the
    other two. The same ``model`` instance is refit for every fold.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    overlap_rate : float
        Window overlap fraction passed to ``dataloader``.
    window_size : int
        Window length in rows passed to ``dataloader``.
    voting : bool, default True
        When true, predictions are smoothed per file with
        ``majority_voting`` before scoring.
    n_repeats : int, default 1
        How many times the three folds are repeated.

    Returns
    -------
    list of float
        One accuracy per fold per repeat; their mean is also printed.
    """
    data_list, file_lengths = dataloader(overlap_rate, window_size)
    X, y = feature_extractor(data_list)

    subjects = (1, 2, 3)
    scores = []
    for _ in trange(n_repeats):
        for pos, held_out in enumerate(subjects):
            # The two remaining subjects, taken in rotation order.
            first = subjects[(pos + 1) % 3]
            second = subjects[(pos + 2) % 3]
            model.fit(X[first] + X[second], y[first] + y[second])

            pred = model.predict(X[held_out])
            if voting:
                pred = majority_voting(pred, file_lengths[held_out])
            scores.append(accuracy_score(y[held_out], pred))
    print(f"\nMean Score: {np.mean(scores)}")
    return scores

## Tuning Experiments

In [96]:
# Random forest (300 trees), 50% overlap, 1000-row windows, majority voting, 5 repeats.
model = RFC(n_estimators=300, n_jobs=-1)
scores = LOOCV_train_evaluate(model, overlap_rate=0.5, window_size=1000, n_repeats=5)

loading the data...	

100%|██████████| 151/151 [00:10<00:00, 14.32it/s]


extracting the features...  

100%|██████████| 1667/1667 [00:55<00:00, 29.84it/s]
100%|██████████| 5/5 [00:30<00:00,  6.01s/it]



Mean Score: 0.3548648832961903


In [98]:
# Random forest (300 trees), 50% overlap, 1000-row windows, no voting, 5 repeats.
model = RFC(n_estimators=300, n_jobs=-1)
scores = LOOCV_train_evaluate(model, overlap_rate=0.5, window_size=1000, voting=False, n_repeats=5)

loading the data...	

100%|██████████| 151/151 [00:10<00:00, 14.11it/s]


extracting the features...  

100%|██████████| 1667/1667 [00:43<00:00, 38.38it/s]
100%|██████████| 5/5 [00:25<00:00,  5.00s/it]


Mean Score: 0.3409629927788467





In [99]:
# Random forest (300 trees), no overlap, 1000-row windows, majority voting, 5 repeats.
model = RFC(n_estimators=300, n_jobs=-1)
scores = LOOCV_train_evaluate(model, overlap_rate=0, window_size=1000, n_repeats=5)

loading the data...	

100%|██████████| 151/151 [00:11<00:00, 13.70it/s]


extracting the features...  

100%|██████████| 865/865 [00:23<00:00, 37.34it/s]
100%|██████████| 5/5 [00:14<00:00,  2.98s/it]


Mean Score: 0.33890826659698253





In [103]:
# Random forest (300 trees), 50% overlap, 1500-row windows, majority voting, 5 repeats.
model = RFC(n_estimators=300, n_jobs=-1)
scores = LOOCV_train_evaluate(model, overlap_rate=0.5, window_size=1500, n_repeats=5)

loading the data...	

100%|██████████| 151/151 [00:10<00:00, 14.44it/s]


extracting the features...  

100%|██████████| 1037/1037 [00:30<00:00, 33.80it/s]
100%|██████████| 5/5 [00:20<00:00,  4.03s/it]


Mean Score: 0.45532755277177184





In [104]:
# Random forest (300 trees), 50% overlap, 1500-row windows, no voting, 5 repeats.
model = RFC(n_estimators=300, n_jobs=-1)
scores = LOOCV_train_evaluate(model, overlap_rate=0.5, window_size=1500, voting=False, n_repeats=5)

loading the data...	

100%|██████████| 151/151 [00:10<00:00, 13.83it/s]


extracting the features...  

100%|██████████| 1037/1037 [00:30<00:00, 34.08it/s]
100%|██████████| 5/5 [00:21<00:00,  4.21s/it]


Mean Score: 0.4067814952683107





In [118]:
# Random forest (300 trees), 50% overlap, 2000-row windows, majority voting, 5 repeats.
model = RFC(n_estimators=300, n_jobs=-1)
scores = LOOCV_train_evaluate(model, overlap_rate=0.5, window_size=2000, n_repeats=5)

loading the data...	

100%|██████████| 151/151 [00:09<00:00, 16.64it/s]


extracting the features...  

100%|██████████| 714/714 [00:18<00:00, 38.47it/s]
100%|██████████| 5/5 [00:14<00:00,  2.84s/it]


Mean Score: 0.5002785642174731





In [119]:
# Random forest (300 trees), no overlap, 2000-row windows, majority voting, 5 repeats.
model = RFC(n_estimators=300, n_jobs=-1)
scores = LOOCV_train_evaluate(model, overlap_rate=0, window_size=2000, n_repeats=5)

loading the data...	

100%|██████████| 151/151 [00:12<00:00, 12.41it/s]


extracting the features...  

100%|██████████| 390/390 [00:15<00:00, 25.52it/s]
100%|██████████| 5/5 [00:14<00:00,  2.96s/it]


Mean Score: 0.47207859134760327





In [105]:
# Random forest (300 trees), 50% overlap, 2500-row windows, majority voting, 5 repeats.
model = RFC(n_estimators=300, n_jobs=-1)
scores = LOOCV_train_evaluate(model, overlap_rate=0.5, window_size=2500, n_repeats=5)

loading the data...	

100%|██████████| 151/151 [00:11<00:00, 13.56it/s]


extracting the features...  

100%|██████████| 546/546 [00:22<00:00, 23.88it/s]
100%|██████████| 5/5 [00:12<00:00,  2.57s/it]


Mean Score: 0.5208222840205537





In [106]:
# Random forest (300 trees), 50% overlap, 2500-row windows, no voting, 5 repeats.
model = RFC(n_estimators=300, n_jobs=-1)
scores = LOOCV_train_evaluate(model, overlap_rate=0.5, window_size=2500, voting=False, n_repeats=5)

loading the data...	

100%|██████████| 151/151 [00:09<00:00, 15.91it/s]


extracting the features...  

100%|██████████| 546/546 [00:19<00:00, 28.65it/s]
100%|██████████| 5/5 [00:13<00:00,  2.65s/it]


Mean Score: 0.49344253327382387





In [107]:
# Random forest (300 trees), 75% overlap, 2500-row windows, majority voting, 5 repeats.
model = RFC(n_estimators=300, n_jobs=-1)
scores = LOOCV_train_evaluate(model, overlap_rate=0.75, window_size=2500, voting=True, n_repeats=5)

loading the data...	

100%|██████████| 151/151 [00:09<00:00, 15.94it/s]


extracting the features...  

100%|██████████| 998/998 [00:28<00:00, 35.17it/s]
100%|██████████| 5/5 [00:16<00:00,  3.32s/it]


Mean Score: 0.5192277058460849





In [108]:
# Random forest (300 trees), 75% overlap, 2500-row windows, no voting, 5 repeats.
model = RFC(n_estimators=300, n_jobs=-1)
scores = LOOCV_train_evaluate(model, overlap_rate=0.75, window_size=2500, voting=False, n_repeats=5)

loading the data...	

100%|██████████| 151/151 [00:11<00:00, 12.59it/s]


extracting the features...  

100%|██████████| 998/998 [00:41<00:00, 23.77it/s]
100%|██████████| 5/5 [00:19<00:00,  3.81s/it]


Mean Score: 0.49908890570408987





In [109]:
# Random forest (300 trees), 75% overlap, 3000-row windows, majority voting, 5 repeats.
model = RFC(n_estimators=300, n_jobs=-1)
scores = LOOCV_train_evaluate(model, overlap_rate=0.75, window_size=3000, voting=True, n_repeats=5)

loading the data...	

100%|██████████| 151/151 [00:12<00:00, 12.23it/s]


extracting the features...  

100%|██████████| 736/736 [00:34<00:00, 21.36it/s]
100%|██████████| 5/5 [00:16<00:00,  3.39s/it]


Mean Score: 0.5505915614443272





In [110]:
# Random forest (100 trees), 75% overlap, 3000-row windows, majority voting, 5 repeats.
model = RFC(n_estimators=100, n_jobs=-1)
scores = LOOCV_train_evaluate(model, overlap_rate=0.75, window_size=3000, voting=True, n_repeats=5)

loading the data...	

100%|██████████| 151/151 [00:17<00:00,  8.62it/s]


extracting the features...  

100%|██████████| 736/736 [00:33<00:00, 21.71it/s]
100%|██████████| 5/5 [00:04<00:00,  1.06it/s]


Mean Score: 0.5428947860356371





In [111]:
# Extra-trees (300 trees), 75% overlap, 3000-row windows, majority voting, 5 repeats.
model = ETC(n_estimators=300, n_jobs=-1)
scores = LOOCV_train_evaluate(model, overlap_rate=0.75, window_size=3000, voting=True, n_repeats=5)

loading the data...	

100%|██████████| 151/151 [00:10<00:00, 14.52it/s]


extracting the features...  

100%|██████████| 736/736 [00:25<00:00, 28.70it/s]
100%|██████████| 5/5 [00:10<00:00,  2.11s/it]


Mean Score: 0.5601645088164776





In [120]:
# Extra-trees (100 trees), 75% overlap, 3000-row windows, majority voting, 5 repeats.
model = ETC(n_estimators=100, n_jobs=-1)
scores = LOOCV_train_evaluate(model, overlap_rate=0.75, window_size=3000, voting=True, n_repeats=5)

loading the data...	

100%|██████████| 151/151 [00:11<00:00, 13.39it/s]


extracting the features...  

100%|██████████| 736/736 [00:23<00:00, 30.93it/s]
100%|██████████| 5/5 [00:03<00:00,  1.64it/s]


Mean Score: 0.5500278850433266





In [121]:
# Random forest (300 trees), 75% overlap, 3000-row windows, no voting, 5 repeats.
model = RFC(n_estimators=300, n_jobs=-1)
scores = LOOCV_train_evaluate(model, overlap_rate=0.75, window_size=3000, voting=False, n_repeats=5)

loading the data...	

100%|██████████| 151/151 [00:10<00:00, 14.76it/s]


extracting the features...  

100%|██████████| 736/736 [00:25<00:00, 29.24it/s]
100%|██████████| 5/5 [00:14<00:00,  2.91s/it]


Mean Score: 0.5116142981647722





In [122]:
# Random forest (300 trees), 75% overlap, 3500-row windows, majority voting, 5 repeats.
model = RFC(n_estimators=300, n_jobs=-1)
scores = LOOCV_train_evaluate(model, overlap_rate=0.75, window_size=3500, voting=True, n_repeats=5)

loading the data...	

100%|██████████| 151/151 [00:10<00:00, 14.01it/s]


extracting the features...  

100%|██████████| 561/561 [00:20<00:00, 27.15it/s]
100%|██████████| 5/5 [00:12<00:00,  2.43s/it]


Mean Score: 0.518524531024531





In [123]:
# Random forest (600 trees), 75% overlap, 3500-row windows, majority voting, 5 repeats.
model = RFC(n_estimators=600, n_jobs=-1)
scores = LOOCV_train_evaluate(model, overlap_rate=0.75, window_size=3500, voting=True, n_repeats=5)

loading the data...	

100%|██████████| 151/151 [00:10<00:00, 14.37it/s]


extracting the features...  

100%|██████████| 561/561 [00:21<00:00, 25.84it/s]
100%|██████████| 5/5 [00:28<00:00,  5.80s/it]


Mean Score: 0.5341630591630593





In [124]:
# Random forest (300 trees), 75% overlap, 4000-row windows, majority voting, 5 repeats.
model = RFC(n_estimators=300, n_jobs=-1)
scores = LOOCV_train_evaluate(model, overlap_rate=0.75, window_size=4000, voting=True, n_repeats=5)

loading the data...	

100%|██████████| 151/151 [00:09<00:00, 16.20it/s]


extracting the features...  

100%|██████████| 414/414 [00:13<00:00, 30.10it/s]
100%|██████████| 5/5 [00:11<00:00,  2.24s/it]


Mean Score: 0.5634731939295846





In [125]:
# Random forest (100 trees), 75% overlap, 4000-row windows, majority voting, 5 repeats.
model = RFC(n_estimators=100, n_jobs=-1)
scores = LOOCV_train_evaluate(model, overlap_rate=0.75, window_size=4000, voting=True, n_repeats=5)

loading the data...	

100%|██████████| 151/151 [00:10<00:00, 14.96it/s]


extracting the features...  

100%|██████████| 414/414 [00:13<00:00, 30.03it/s]
100%|██████████| 5/5 [00:03<00:00,  1.35it/s]


Mean Score: 0.5366579049844237





In [126]:
# Random forest (600 trees), 75% overlap, 4000-row windows, majority voting, 5 repeats.
model = RFC(n_estimators=600, n_jobs=-1)
scores = LOOCV_train_evaluate(model, overlap_rate=0.75, window_size=4000, voting=True, n_repeats=5)

loading the data...	

100%|██████████| 151/151 [00:10<00:00, 13.77it/s]


extracting the features...  

100%|██████████| 414/414 [00:12<00:00, 32.60it/s]
100%|██████████| 5/5 [00:23<00:00,  4.76s/it]


Mean Score: 0.5626498894864164





In [127]:
# Extra-trees (300 trees), 75% overlap, 4000-row windows, majority voting, 5 repeats.
model = ETC(n_estimators=300, n_jobs=-1)
scores = LOOCV_train_evaluate(model, overlap_rate=0.75, window_size=4000, voting=True, n_repeats=5)

loading the data...	

100%|██████████| 151/151 [00:11<00:00, 12.68it/s]


extracting the features...  

100%|██████████| 414/414 [00:13<00:00, 30.08it/s]
100%|██████████| 5/5 [00:06<00:00,  1.40s/it]


Mean Score: 0.5625066895525506





In [128]:
# Extra-trees (600 trees), 75% overlap, 4000-row windows, majority voting, 5 repeats.
model = ETC(n_estimators=600, n_jobs=-1)
scores = LOOCV_train_evaluate(model, overlap_rate=0.75, window_size=4000, voting=True, n_repeats=5)

loading the data...	

100%|██████████| 151/151 [00:11<00:00, 13.35it/s]


extracting the features...  

100%|██████████| 414/414 [00:14<00:00, 28.30it/s]
100%|██████████| 5/5 [00:15<00:00,  3.16s/it]


Mean Score: 0.5838740449711969





In [129]:
# Extra-trees (800 trees), 75% overlap, 4000-row windows, majority voting, 5 repeats.
model = ETC(n_estimators=800, n_jobs=-1)
scores = LOOCV_train_evaluate(model, overlap_rate=0.75, window_size=4000, voting=True, n_repeats=5)

loading the data...	

100%|██████████| 151/151 [00:12<00:00, 12.44it/s]


extracting the features...  

100%|██████████| 414/414 [00:13<00:00, 30.85it/s]
100%|██████████| 5/5 [00:20<00:00,  4.10s/it]


Mean Score: 0.5733421058928976





In [134]:
# Extra-trees (1500 trees), 75% overlap, 4000-row windows, majority voting, 5 repeats.
model = ETC(n_estimators=1500, n_jobs=-1)
scores = LOOCV_train_evaluate(model, overlap_rate=0.75, window_size=4000, voting=True, n_repeats=5)

loading the data...	

100%|██████████| 151/151 [00:10<00:00, 13.88it/s]


extracting the features...  

100%|██████████| 414/414 [00:17<00:00, 23.75it/s]
100%|██████████| 5/5 [00:46<00:00,  9.25s/it]


Mean Score: 0.5851606417184427





In [135]:
# Extra-trees (1500 trees), 75% overlap, 4000-row windows, no voting, 5 repeats.
model = ETC(n_estimators=1500, n_jobs=-1)
scores = LOOCV_train_evaluate(model, overlap_rate=0.75, window_size=4000, voting=False, n_repeats=5)

loading the data...	

100%|██████████| 151/151 [00:10<00:00, 14.43it/s]


extracting the features...  

100%|██████████| 414/414 [00:15<00:00, 27.12it/s]
100%|██████████| 5/5 [00:40<00:00,  8.09s/it]


Mean Score: 0.5757194532188168





In [136]:
# Extra-trees (1500 trees), 75% overlap, 4500-row windows, no voting, 5 repeats.
model = ETC(n_estimators=1500, n_jobs=-1)
scores = LOOCV_train_evaluate(model, overlap_rate=0.75, window_size=4500, voting=False, n_repeats=5)

loading the data...	

100%|██████████| 151/151 [00:10<00:00, 15.06it/s]


extracting the features...  

100%|██████████| 310/310 [00:09<00:00, 31.55it/s]
100%|██████████| 5/5 [00:29<00:00,  5.97s/it]


Mean Score: 0.565074572417814



