In [81]:
import glob
from datetime import datetime
import os

import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats import mode
from scipy.fft import fft
from sklearn.ensemble import RandomForestClassifier as RFC, ExtraTreesClassifier as ETC
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score

from tqdm import tqdm, trange

# Baseline Leave One Out

In [15]:
def segmentation(df, overlap_rate, time_window):
    """Cut one recording into fixed-length, overlapping windows.

    Missing values are filled before windowing: interpolation first, then a
    forward fill, then 0 for anything that is still missing.

    Args:
        df: DataFrame holding one recording, one row per sample.
        overlap_rate: Fraction of a window shared with the next window
            (0.5 means the window advances by half its length).
        time_window: Window length in rows.

    Returns:
        A list of DataFrames, each with ``time_window`` rows and a fresh
        0-based index. The list is empty when the recording is shorter than
        one window.
    """
    # Convert the overlap rate to the sliding-window step. Truncation can give
    # 0 (e.g. int((1 - 0.9) * 10) == 0), which range() rejects, so the step is
    # clamped to at least one row.
    step = max(1, int((1 - overlap_rate) * time_window))
    # interpolate
    filled = df.interpolate().ffill().fillna(0)
    # Slice by position so the window contents do not depend on the index labels.
    return [
        filled.iloc[start:start + time_window].reset_index(drop=True)
        for start in range(0, len(filled) - time_window + 1, step)
    ]

In [16]:
def get_features(x_data):
    """Build a flat feature vector of summary statistics.

    For every column of ``x_data``, in column order, six values are emitted:
    population standard deviation, mean, maximum, minimum, median, variance.

    Args:
        x_data: DataFrame of signal columns.

    Returns:
        A list with six entries per column.
    """
    features = []
    for name in x_data.columns.tolist():
        series = x_data[name]
        features.extend([
            series.std(ddof=0),
            np.average(series),
            np.max(series),
            np.min(series),
            np.median(series),
            np.var(series),
        ])
    return features

In [17]:
def dataloader(overlap, window_size, verbose=True, data_glob="../TrainData/*/*/*.csv"):
    """Read every training CSV and cut it into windows.

    Args:
        overlap: Overlap rate forwarded to ``segmentation``.
        window_size: Window length (rows) forwarded to ``segmentation``.
        verbose: When true, print a status line and show a progress bar.
        data_glob: Glob pattern selecting the CSV files to load.

    Returns:
        ``(data_list, file_lengths)``. ``data_list`` holds the windows of all
        files in load order. ``file_lengths`` maps each person id (1, 2, 3) to
        the number of windows produced by each of that person's files, in the
        same order. Files that yield no window are not recorded.
    """
    if verbose:
        print("loading the data...", end="\t")
    # glob returns files in an OS-dependent order; sorting keeps the window
    # order (and therefore the results) reproducible between machines.
    paths = sorted(glob.glob(data_glob))
    data_list = []
    file_lengths = {1: [], 2: [], 3: []}
    for path in (tqdm(paths) if verbose else paths):
        segments = segmentation(pd.read_csv(path), overlap, window_size)
        if segments:
            # Look the subject up by column name instead of by position, the
            # same way feature_extractor does.
            person = segments[0]["subject_id"].iloc[0]
            file_lengths[person].append(len(segments))
        data_list.extend(segments)
    return data_list, file_lengths

In [18]:
def feature_extractor(data_list, verbose=True):
    """Turn windows into per-person feature vectors and labels.

    Args:
        data_list: List of window DataFrames containing ``subject_id`` and
            ``activity`` columns next to the signal columns.
        verbose: When true, print a status line and show a progress bar.

    Returns:
        ``(X, y)``: two dicts keyed by person id (1, 2, 3). ``X[p]`` is the
        list of feature vectors and ``y[p]`` the list of activity labels, in
        the order the windows appear in ``data_list``.
    """
    if verbose:
        print("extracting the features...", end="  ")
    X = {1: [], 2: [], 3: []}
    y = {1: [], 2: [], 3: []}
    positions = trange(0, len(data_list)) if verbose else range(0, len(data_list))
    for pos in positions:
        segment = data_list[pos]
        person = segment.loc[0, "subject_id"]
        # Everything except the two metadata columns is treated as signal.
        signals = segment.drop(columns=["subject_id", "activity"])
        X[person].append(get_features(signals))
        # The label of the first row stands for the whole window.
        y[person].append(segment["activity"].iloc[0])
    return X, y

In [19]:
def majority_voting(predictions, file_lengths):
    """Replace each file's window predictions with that file's majority label.

    Args:
        predictions: Sequence of per-window predictions, grouped file by file.
        file_lengths: Number of windows in each file, in the same order.

    Returns:
        A list as long as ``sum(file_lengths)`` in which every window carries
        the most frequent prediction of its file. Ties go to the smallest
        label.
    """
    filtered_predictions = []
    index = 0
    for length in file_lengths:
        file_pred = predictions[index:index+length]
        index += length
        if len(file_pred) == 0:
            # Nothing to vote on for this file.
            continue
        # np.unique is used instead of scipy.stats.mode: recent SciPy returns
        # a scalar mode for 1-D input (so `.mode[0]` fails) and rejects
        # non-numeric labels. The labels come back sorted and argmax takes the
        # first maximum, so ties still resolve to the smallest label.
        labels, counts = np.unique(file_pred, return_counts=True)
        majority_choice = labels[np.argmax(counts)]
        filtered_predictions.extend([majority_choice]*length)
    return filtered_predictions

In [79]:
def LOOCV_train_evaluate(model, overlap_rate, window_size, voting=True, n_repeats=1, verbose=True):
    """Evaluate ``model`` by holding out one person at a time (persons 1, 2, 3).

    Args:
        model: Estimator exposing ``fit`` and ``predict``; it is refitted for
            every fold.
        overlap_rate: Overlap rate passed to ``dataloader``.
        window_size: Window length passed to ``dataloader``.
        voting: When true, predictions are smoothed per file with
            ``majority_voting`` before scoring.
        n_repeats: How many times the three folds are repeated.
        verbose: When true, show progress and print the mean score.

    Returns:
        A list of ``3 * n_repeats`` accuracy scores, one per fold.
    """
    segments, lengths_by_person = dataloader(overlap_rate, window_size, verbose=verbose)
    X, y = feature_extractor(segments, verbose=verbose)
    # (held-out person, first training person, second training person)
    folds = [(1, 2, 3), (2, 3, 1), (3, 1, 2)]
    scores = []
    repeats = trange(n_repeats) if verbose else range(n_repeats)
    for _ in repeats:
        for held_out, first, second in folds:
            model.fit(X[first] + X[second], y[first] + y[second])
            predicted = model.predict(X[held_out])
            if voting:
                predicted = majority_voting(predicted, lengths_by_person[held_out])
            scores.append(accuracy_score(y[held_out], predicted))
    if verbose:
        print(f"\nMean Score: {np.mean(scores)}")
    return scores

## Grid Search

In [37]:
def GridSearch(parameters, csvpath = "..", filename="gridCV_results", n_repeats=7, verbose=False, progress=True):
    """Score every (model, window_size, overlap_rate) combination and save a CSV.

    Args:
        parameters: Dict with the keys ``"model"``, ``"window_size"`` and
            ``"overlap_rate"``, each mapping to a list of candidates.
        csvpath: Directory the result file is written to.
        filename: Prefix of the result file; a timestamp is appended.
        n_repeats: Repeats forwarded to ``LOOCV_train_evaluate``.
        verbose: Verbosity forwarded to ``LOOCV_train_evaluate``.
        progress: When true, show a progress bar over the combinations.
    """
    columns = ["model", "window_size", "overlap_rate", "n_repeats", "avg_score", "scores"]
    combinations = [
        (model, window_size, overlap_rate)
        for model in parameters["model"]
        for window_size in parameters["window_size"]
        for overlap_rate in parameters["overlap_rate"]
    ]
    if progress:
        combinations = tqdm(combinations)
    records = []
    # Unpack in the order the tuples were built; swapping window_size and
    # overlap_rate here would hand each value to the wrong parameter.
    for model, window_size, overlap_rate in combinations:
        scores = LOOCV_train_evaluate(model, overlap_rate, window_size, n_repeats=n_repeats, verbose=verbose)
        records.append({"model": str(model), "window_size": window_size,
                        "overlap_rate": overlap_rate, "n_repeats": n_repeats,
                        "avg_score": np.mean(scores), "scores": scores})
    # Build the frame once; DataFrame.append no longer exists in pandas 2.x.
    score_df = pd.DataFrame(records, columns=columns)
    # Format the timestamp explicitly, and strip ':' from the file name only,
    # so a drive letter in csvpath stays intact.
    timestamp = datetime.now().strftime("%Y-%m-%d %H.%M.%S")
    basename = f"{filename}_{timestamp}".replace(":", ".")
    savepath = f"{csvpath}/{basename}.csv"
    score_df.to_csv(savepath, index=False)
    print(f"result exported to: {savepath}")


In [None]:
# Search space: random forests and extra-trees with 300 / 600 / 1200 trees,
# three window lengths and two overlap rates.
parameters = {
    "model": [
        estimator(n_trees, n_jobs=-1)
        for estimator in (RFC, ETC)
        for n_trees in (300, 600, 1200)
    ],
    "window_size": [2000, 3000, 4000],
    "overlap_rate": [0.5, 0.75],
}
GridSearch(parameters)

## Tuning Experiments

In [96]:
# Random forest, 300 trees; 50% overlap, 1000-row windows, 5 repeats, voting on.
model = RFC(n_estimators=300, n_jobs=-1)
scores = LOOCV_train_evaluate(model, overlap_rate=0.5, window_size=1000, n_repeats=5)

loading the data...	

100%|██████████| 151/151 [00:10<00:00, 14.32it/s]


extracting the features...  

100%|██████████| 1667/1667 [00:55<00:00, 29.84it/s]
100%|██████████| 5/5 [00:30<00:00,  6.01s/it]



Mean Score: 0.3548648832961903


In [135]:
# Extra-trees, 1500 trees; 75% overlap, 4000-row windows, 5 repeats, voting off.
model = ETC(n_estimators=1500, n_jobs=-1)
scores = LOOCV_train_evaluate(model, overlap_rate=0.75, window_size=4000, voting=False, n_repeats=5)

loading the data...	

100%|██████████| 151/151 [00:10<00:00, 14.43it/s]


extracting the features...  

100%|██████████| 414/414 [00:15<00:00, 27.12it/s]
100%|██████████| 5/5 [00:40<00:00,  8.09s/it]


Mean Score: 0.5757194532188168





In [136]:
# Extra-trees, 1500 trees; 75% overlap, 4500-row windows, 5 repeats, voting off.
model = ETC(n_estimators=1500, n_jobs=-1)
scores = LOOCV_train_evaluate(model, overlap_rate=0.75, window_size=4500, voting=False, n_repeats=5)

loading the data...	

100%|██████████| 151/151 [00:10<00:00, 15.06it/s]


extracting the features...  

100%|██████████| 310/310 [00:09<00:00, 31.55it/s]
100%|██████████| 5/5 [00:29<00:00,  5.97s/it]


Mean Score: 0.565074572417814





# LOOCV with frequency domain features

In [10]:
def segmentation(df, overlap_rate, time_window):
    """Cut one recording into fixed-length, overlapping windows.

    Missing values are filled before windowing: interpolation first, then a
    forward fill, then 0 for anything that is still missing.

    Args:
        df: DataFrame holding one recording, one row per sample.
        overlap_rate: Fraction of a window shared with the next window
            (0.5 means the window advances by half its length).
        time_window: Window length in rows.

    Returns:
        A list of DataFrames, each with ``time_window`` rows and a fresh
        0-based index. The list is empty when the recording is shorter than
        one window.
    """
    # Convert the overlap rate to the sliding-window step. Truncation can give
    # 0 (e.g. int((1 - 0.9) * 10) == 0), which range() rejects, so the step is
    # clamped to at least one row.
    step = max(1, int((1 - overlap_rate) * time_window))
    # interpolate
    filled = df.interpolate().ffill().fillna(0)
    # Slice by position so the window contents do not depend on the index labels.
    return [
        filled.iloc[start:start + time_window].reset_index(drop=True)
        for start in range(0, len(filled) - time_window + 1, step)
    ]


def get_features(x_data):
    """Build a flat feature vector from time- and frequency-domain statistics.

    For every column of ``x_data``, in column order, fourteen values are
    emitted:

    * time domain: population std, mean, max, min, median, variance;
    * squared FFT magnitude of the column: skewness, kurtosis, std, mean,
      max, min, median, variance.

    Args:
        x_data: DataFrame of signal columns.

    Returns:
        A list with fourteen entries per column.
    """
    features = []
    for name in x_data.columns.tolist():
        series = x_data[name]
        # Time-domain statistics.
        features.extend([
            series.std(ddof=0),
            np.average(series),
            np.max(series),
            np.min(series),
            np.median(series),
            np.var(series),
        ])
        # Squared magnitude of the FFT of the column.
        spectrum = np.abs(fft(np.array(series)))**2
        features.extend([
            stats.skew(spectrum),
            stats.kurtosis(spectrum),
            spectrum.std(ddof=0),
            np.average(spectrum),
            np.max(spectrum),
            np.min(spectrum),
            np.median(spectrum),
            np.var(spectrum),
        ])
    return features


def dataloader(overlap, window_size, verbose=True, data_glob="../TrainData/*/*/*.csv"):
    """Read every training CSV and cut it into windows.

    Args:
        overlap: Overlap rate forwarded to ``segmentation``.
        window_size: Window length (rows) forwarded to ``segmentation``.
        verbose: When true, print a status line and show a progress bar.
        data_glob: Glob pattern selecting the CSV files to load.

    Returns:
        ``(data_list, file_lengths)``. ``data_list`` holds the windows of all
        files in load order. ``file_lengths`` maps each person id (1, 2, 3) to
        the number of windows produced by each of that person's files, in the
        same order. Files that yield no window are not recorded.
    """
    if verbose:
        print("loading the data...", end="\t")
    # glob returns files in an OS-dependent order; sorting keeps the window
    # order (and therefore the results) reproducible between machines.
    paths = sorted(glob.glob(data_glob))
    data_list = []
    file_lengths = {1: [], 2: [], 3: []}
    for path in (tqdm(paths) if verbose else paths):
        segments = segmentation(pd.read_csv(path), overlap, window_size)
        if segments:
            # Look the subject up by column name instead of by position, the
            # same way feature_extractor does.
            person = segments[0]["subject_id"].iloc[0]
            file_lengths[person].append(len(segments))
        data_list.extend(segments)
    return data_list, file_lengths


def feature_extractor(data_list, verbose=True):
    """Turn windows into per-person feature vectors and labels.

    Args:
        data_list: List of window DataFrames containing ``subject_id`` and
            ``activity`` columns next to the signal columns.
        verbose: When true, print a status line and show a progress bar.

    Returns:
        ``(X, y)``: two dicts keyed by person id (1, 2, 3). ``X[p]`` is the
        list of feature vectors and ``y[p]`` the list of activity labels, in
        the order the windows appear in ``data_list``.
    """
    if verbose:
        print(f"extracting the features...", end="  ")
    X, y = {1:[], 2:[], 3:[]}, {1:[], 2:[], 3:[]}
    segments = tqdm(data_list) if verbose else data_list
    for segment in segments:
        # Read both metadata values by column name so the result does not
        # depend on where those columns sit in the frame.
        person = segment["subject_id"].iloc[0]
        label = segment["activity"].iloc[0]
        # extract only the signal columns
        x_data = segment.drop(columns=["subject_id","activity"])
        X[person].append(get_features(x_data))
        y[person].append(label)
    return X, y


def majority_voting(predictions, file_lengths):
    """Replace each file's window predictions with that file's majority label.

    Args:
        predictions: Sequence of per-window predictions, grouped file by file.
        file_lengths: Number of windows in each file, in the same order.

    Returns:
        A list as long as ``sum(file_lengths)`` in which every window carries
        the most frequent prediction of its file. Ties go to the smallest
        label.
    """
    filtered_predictions = []
    index = 0
    for length in file_lengths:
        file_pred = predictions[index:index+length]
        index += length
        if len(file_pred) == 0:
            # Nothing to vote on for this file.
            continue
        # np.unique is used instead of scipy.stats.mode: recent SciPy returns
        # a scalar mode for 1-D input (so `.mode[0]` fails) and rejects
        # non-numeric labels. The labels come back sorted and argmax takes the
        # first maximum, so ties still resolve to the smallest label.
        labels, counts = np.unique(file_pred, return_counts=True)
        majority_choice = labels[np.argmax(counts)]
        filtered_predictions.extend([majority_choice]*length)
    return filtered_predictions

In [11]:
def LOOCV_train_evaluate(model, overlap_rate, window_size, voting=True, n_repeats=1, verbose=True):
    """Evaluate ``model`` by holding out one person at a time (persons 1, 2, 3).

    Args:
        model: Estimator exposing ``fit`` and ``predict``; it is refitted for
            every fold.
        overlap_rate: Overlap rate passed to ``dataloader``.
        window_size: Window length passed to ``dataloader``.
        voting: When true, predictions are smoothed per file with
            ``majority_voting`` before scoring.
        n_repeats: How many times the three folds are repeated.
        verbose: When true, show progress and print the mean score.

    Returns:
        A list of ``3 * n_repeats`` accuracy scores, one per fold.
    """
    segments, lengths_by_person = dataloader(overlap_rate, window_size, verbose=verbose)
    X, y = feature_extractor(segments, verbose=verbose)
    # (held-out person, first training person, second training person)
    folds = [(1, 2, 3), (2, 3, 1), (3, 1, 2)]
    scores = []
    repeats = trange(n_repeats) if verbose else range(n_repeats)
    for _ in repeats:
        for held_out, first, second in folds:
            model.fit(X[first] + X[second], y[first] + y[second])
            predicted = model.predict(X[held_out])
            if voting:
                predicted = majority_voting(predicted, lengths_by_person[held_out])
            scores.append(accuracy_score(y[held_out], predicted))
    if verbose:
        print(f"\nMean Score: {np.mean(scores)}")
    return scores

In [12]:
def GridSearch(parameters, csvpath = "..", n_repeats=7, verbose=False):
    """Score every (model, window_size, overlap_rate) combination and save a CSV.

    Args:
        parameters: Dict with the keys ``"model"``, ``"window_size"`` and
            ``"overlap_rate"``, each mapping to a list of candidates.
        csvpath: Directory the result file is written to.
        n_repeats: Repeats forwarded to ``LOOCV_train_evaluate``.
        verbose: Verbosity forwarded to ``LOOCV_train_evaluate``.
    """
    columns = ["model", "window_size", "overlap_rate", "n_repeats", "avg_score", "scores"]
    records = []
    for model in parameters["model"]:
        for window_size in parameters["window_size"]:
            for overlap_rate in parameters["overlap_rate"]:
                scores = LOOCV_train_evaluate(model, overlap_rate, window_size, n_repeats=n_repeats, verbose=verbose)
                records.append({"model": str(model), "window_size": window_size,
                                "overlap_rate": overlap_rate, "n_repeats": n_repeats,
                                "avg_score": np.mean(scores), "scores": scores})
    # Build the frame once; DataFrame.append no longer exists in pandas 2.x.
    score_df = pd.DataFrame(records, columns=columns)
    # Format the timestamp explicitly instead of slicing str(datetime.now()),
    # and leave csvpath untouched so a drive letter keeps its ':'.
    timestamp = datetime.now().strftime("%Y-%m-%d %H.%M.%S")
    savepath = f"{csvpath}/grid_search_result_{timestamp}.csv"
    score_df.to_csv(savepath, index=False)
    print(f"result exported to: {savepath}")


In [None]:
# Search space: random forests and extra-trees with 300 / 600 / 1200 trees,
# three window lengths and two overlap rates.
parameters = {
    "model": [
        estimator(n_trees, n_jobs=-1)
        for estimator in (RFC, ETC)
        for n_trees in (300, 600, 1200)
    ],
    "window_size": [2000, 3000, 4000],
    "overlap_rate": [0.5, 0.75],
}
GridSearch(parameters)

## Tuning Experiments

In [8]:
# Random forest, 600 trees; 75% overlap, 4000-row windows, 10 repeats, voting on.
model = RFC(n_estimators=600, n_jobs=-1)
scores = LOOCV_train_evaluate(model, overlap_rate=0.75, window_size=4000, n_repeats=10)

loading the data...	

100%|██████████| 151/151 [00:09<00:00, 16.46it/s]


extracting the features...  

100%|██████████| 414/414 [00:24<00:00, 16.66it/s]
100%|██████████| 10/10 [00:42<00:00,  4.25s/it]


Mean Score: 0.5870291343610226





In [6]:
# Random forest, 1500 trees; 75% overlap, 4000-row windows, 5 repeats, voting on.
model = RFC(n_estimators=1500, n_jobs=-1)
scores = LOOCV_train_evaluate(model, overlap_rate=0.75, window_size=4000, n_repeats=5)

loading the data...	

100%|██████████| 151/151 [00:09<00:00, 15.30it/s]


extracting the features...  

100%|██████████| 414/414 [00:25<00:00, 16.42it/s]
100%|██████████| 5/5 [00:57<00:00, 11.41s/it]


Mean Score: 0.5743854041577472





In [10]:
# Extra-trees, 300 trees; 75% overlap, 4000-row windows, 5 repeats, voting on.
model = ETC(n_estimators=300, n_jobs=-1)
scores = LOOCV_train_evaluate(model, overlap_rate=0.75, window_size=4000, n_repeats=5)

loading the data...	

100%|██████████| 151/151 [00:09<00:00, 16.33it/s]


extracting the features...  

100%|██████████| 414/414 [00:25<00:00, 16.34it/s]
100%|██████████| 5/5 [00:07<00:00,  1.56s/it]


Mean Score: 0.5752278254929603





In [11]:
# Extra-trees, 1500 trees; 75% overlap, 4000-row windows, 5 repeats, voting on.
model = ETC(n_estimators=1500, n_jobs=-1)
scores = LOOCV_train_evaluate(model, overlap_rate=0.75, window_size=4000, n_repeats=5)

loading the data...	

100%|██████████| 151/151 [00:09<00:00, 15.59it/s]


extracting the features...  

100%|██████████| 414/414 [00:25<00:00, 16.50it/s]
100%|██████████| 5/5 [00:38<00:00,  7.66s/it]


Mean Score: 0.5693867039976331





# LOOCV with speed and acceleration features

In [None]:
def segmentation(df, overlap_rate, time_window):
    """Cut one recording into fixed-length, overlapping windows.

    Missing values are filled before windowing: interpolation first, then a
    forward fill, then 0 for anything that is still missing.

    Args:
        df: DataFrame holding one recording, one row per sample.
        overlap_rate: Fraction of a window shared with the next window
            (0.5 means the window advances by half its length).
        time_window: Window length in rows.

    Returns:
        A list of DataFrames, each with ``time_window`` rows and a fresh
        0-based index. The list is empty when the recording is shorter than
        one window.
    """
    # Convert the overlap rate to the sliding-window step. Truncation can give
    # 0 (e.g. int((1 - 0.9) * 10) == 0), which range() rejects, so the step is
    # clamped to at least one row.
    step = max(1, int((1 - overlap_rate) * time_window))
    # interpolate
    filled = df.interpolate().ffill().fillna(0)
    # Slice by position so the window contents do not depend on the index labels.
    return [
        filled.iloc[start:start + time_window].reset_index(drop=True)
        for start in range(0, len(filled) - time_window + 1, step)
    ]

def get_speed_acc(x_data):
    """Derive first- and second-difference streams from ``x_data``.

    Args:
        x_data: DataFrame of numeric columns.

    Returns:
        ``(speed, acc)``: the row-to-row difference of ``x_data`` and the
        row-to-row difference of that difference. Undefined leading values are
        0, and the columns carry the suffixes ``_speed`` and ``_acc``.
    """
    first_diff = x_data.diff().fillna(0)
    second_diff = first_diff.diff().fillna(0)
    speed = first_diff.rename(columns=lambda col: f"{col}_speed")
    acc = second_diff.rename(columns=lambda col: f"{col}_acc")
    return speed, acc

def get_streams(x_data):
    """Return the derived ``(speed, acc)`` streams for ``x_data``.

    Thin wrapper that forwards to ``get_speed_acc``.
    """
    return get_speed_acc(x_data)

def get_features(x_data):
    """Build a flat feature vector from time-domain and spectral statistics.

    For every column of ``x_data``, in column order, eight values are emitted:
    population std, mean, max, min, median and variance of the column,
    followed by the skewness and kurtosis of its squared FFT magnitude.

    Args:
        x_data: DataFrame of signal columns.

    Returns:
        A list with eight entries per column.
    """
    features = []
    for name in x_data.columns.tolist():
        series = x_data[name]
        # Squared magnitude of the FFT of the column.
        spectrum = np.abs(fft(np.array(series)))**2
        features.extend([
            series.std(ddof=0),
            np.average(series),
            np.max(series),
            np.min(series),
            np.median(series),
            np.var(series),
            stats.skew(spectrum),
            stats.kurtosis(spectrum),
        ])
    return features


def dataloader(overlap, window_size, verbose=True, data_glob="../TrainData/*/*/*.csv"):
    """Read every training CSV and cut it into windows.

    Args:
        overlap: Overlap rate forwarded to ``segmentation``.
        window_size: Window length (rows) forwarded to ``segmentation``.
        verbose: When true, print a status line and show a progress bar.
        data_glob: Glob pattern selecting the CSV files to load.

    Returns:
        ``(data_list, file_lengths)``. ``data_list`` holds the windows of all
        files in load order. ``file_lengths`` maps each person id (1, 2, 3) to
        the number of windows produced by each of that person's files, in the
        same order. Files that yield no window are not recorded.
    """
    if verbose:
        print("loading the data...", end="\t")
    # glob returns files in an OS-dependent order; sorting keeps the window
    # order (and therefore the results) reproducible between machines.
    paths = sorted(glob.glob(data_glob))
    data_list = []
    file_lengths = {1: [], 2: [], 3: []}
    for path in (tqdm(paths) if verbose else paths):
        segments = segmentation(pd.read_csv(path), overlap, window_size)
        if segments:
            # First row of the first window identifies the file's subject.
            person = segments[0]["subject_id"].iloc[0]
            file_lengths[person].append(len(segments))
        data_list.extend(segments)
    return data_list, file_lengths


def feature_extractor(data_list, verbose=True):
    """Turn windows into per-person feature vectors and labels.

    Args:
        data_list: List of window DataFrames containing ``subject_id`` and
            ``activity`` columns next to the signal columns.
        verbose: When true, print a status line and show a progress bar.

    Returns:
        ``(X, y)``: two dicts keyed by person id (1, 2, 3). ``X[p]`` is the
        list of feature vectors and ``y[p]`` the list of activity labels, in
        the order the windows appear in ``data_list``.
    """
    if verbose:
        print(f"extracting the features...", end="  ")
    X, y = {1:[], 2:[], 3:[]}, {1:[], 2:[], 3:[]}
    segments = tqdm(data_list) if verbose else data_list
    for segment in segments:
        # Read both metadata values by column name so the result does not
        # depend on where those columns sit in the frame.
        person = segment["subject_id"].iloc[0]
        label = segment["activity"].iloc[0]
        # extract only the signal columns
        x_data = segment.drop(columns=["subject_id","activity"])
        X[person].append(get_features(x_data))
        y[person].append(label)
    return X, y


def majority_voting(predictions, file_lengths):
    """Replace each file's window predictions with that file's majority label.

    Args:
        predictions: Sequence of per-window predictions, grouped file by file.
        file_lengths: Number of windows in each file, in the same order.

    Returns:
        A list as long as ``sum(file_lengths)`` in which every window carries
        the most frequent prediction of its file. Ties go to the smallest
        label.
    """
    filtered_predictions = []
    index = 0
    for length in file_lengths:
        file_pred = predictions[index:index+length]
        index += length
        if len(file_pred) == 0:
            # Nothing to vote on for this file.
            continue
        # np.unique is used instead of scipy.stats.mode: recent SciPy returns
        # a scalar mode for 1-D input (so `.mode[0]` fails) and rejects
        # non-numeric labels. The labels come back sorted and argmax takes the
        # first maximum, so ties still resolve to the smallest label.
        labels, counts = np.unique(file_pred, return_counts=True)
        majority_choice = labels[np.argmax(counts)]
        filtered_predictions.extend([majority_choice]*length)
    return filtered_predictions

In [None]:
def LOOCV_train_evaluate(model, overlap_rate, window_size, voting=True, n_repeats=1, verbose=True):
    """Leave-one-person-out evaluation with speed and acceleration streams.

    Each window is extended with the streams returned by ``get_streams``
    before feature extraction.

    Args:
        model: Estimator exposing ``fit`` and ``predict``; it is refitted for
            every fold.
        overlap_rate: Overlap rate passed to ``dataloader``.
        window_size: Window length passed to ``dataloader``.
        voting: When true, predictions are smoothed per file with
            ``majority_voting`` before scoring.
        n_repeats: How many times the three folds are repeated.
        verbose: When true, show progress and print the mean score.

    Returns:
        A list of ``3 * n_repeats`` accuracy scores, one per fold.
    """
    scores = []
    data_list, file_lengths = dataloader(overlap_rate, window_size, verbose=verbose)
    meta_cols = ["subject_id", "activity"]
    stream_list = []
    for df in data_list:
        # Differentiate the signal columns only; the id and label columns are
        # not signals.
        signals = df.drop(columns=meta_cols)
        speed, acc = get_streams(signals)
        # feature_extractor expects one DataFrame per window, so the streams
        # are joined column-wise instead of being passed as a tuple. The
        # metadata columns stay last, in their original order.
        # NOTE(review): the raw signals are kept next to speed and acc;
        # confirm this is the intended feature set.
        stream_list.append(pd.concat([signals, speed, acc, df[meta_cols]], axis=1))
    X, y = feature_extractor(stream_list, verbose=verbose)
    num_range = trange(n_repeats) if verbose else range(n_repeats)
    for _ in num_range:
        # (held-out person, first training person, second training person)
        for p1, p2, p3 in [(1,2,3), (2,3,1), (3,1,2)]:
            X_test, y_test = X[p1], y[p1]
            X_train = X[p2] + X[p3]
            y_train = y[p2] + y[p3]
            model.fit(X_train, y_train)
            pred = model.predict(X_test)
            if voting:
                pred = majority_voting(pred, file_lengths[p1])
            scores.append(accuracy_score(y_test, pred))
    if verbose:
        print(f"\nMean Score: {np.mean(scores)}")
    return scores

In [None]:
def GridSearch(parameters, csvpath = "..", filename="gridCV_results", n_repeats=7, verbose=False, progress=True):
    """Score every (model, window_size, overlap_rate) combination and save a CSV.

    Args:
        parameters: Dict with the keys ``"model"``, ``"window_size"`` and
            ``"overlap_rate"``, each mapping to a list of candidates.
        csvpath: Directory the result file is written to.
        filename: Prefix of the result file; a timestamp is appended.
        n_repeats: Repeats forwarded to ``LOOCV_train_evaluate``.
        verbose: Verbosity forwarded to ``LOOCV_train_evaluate``.
        progress: When true, show a progress bar over the combinations.
    """
    columns = ["model", "window_size", "overlap_rate", "n_repeats", "avg_score", "scores"]
    combinations = [
        (model, window_size, overlap_rate)
        for model in parameters["model"]
        for window_size in parameters["window_size"]
        for overlap_rate in parameters["overlap_rate"]
    ]
    if progress:
        combinations = tqdm(combinations)
    records = []
    # Unpack in the order the tuples were built; swapping window_size and
    # overlap_rate here would hand each value to the wrong parameter.
    for model, window_size, overlap_rate in combinations:
        scores = LOOCV_train_evaluate(model, overlap_rate, window_size, n_repeats=n_repeats, verbose=verbose)
        records.append({"model": str(model), "window_size": window_size,
                        "overlap_rate": overlap_rate, "n_repeats": n_repeats,
                        "avg_score": np.mean(scores), "scores": scores})
    # Build the frame once; DataFrame.append no longer exists in pandas 2.x.
    score_df = pd.DataFrame(records, columns=columns)
    # Format the timestamp explicitly, and strip ':' from the file name only,
    # so a drive letter in csvpath stays intact.
    timestamp = datetime.now().strftime("%Y-%m-%d %H.%M.%S")
    basename = f"{filename}_{timestamp}".replace(":", ".")
    savepath = f"{csvpath}/{basename}.csv"
    score_df.to_csv(savepath, index=False)
    print(f"result exported to: {savepath}")
