In [3]:
import glob
import json
import os
from datetime import datetime

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy import stats, signal
from scipy.stats import mode
from scipy.fft import fft
from sklearn.ensemble import RandomForestClassifier as RFC, ExtraTreesClassifier as ETC
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score

from tqdm import tqdm, trange

In [71]:
def std_score(file):
    """Load a grid-search result CSV and rank its rows by mean accuracy.

    Parameters
    ----------
    file : str or path-like
        CSV with an ``avg_score`` column and a ``scores`` column holding a
        JSON-encoded list of per-fold scores.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by ``avg_score`` (best first), with ``scores`` decoded to
        Python lists and an added ``std_score`` column holding the population
        standard deviation of each list.
    """
    # `json` has to be provided by the notebook's import cell.
    results = pd.read_csv(file).sort_values("avg_score", ascending=False)
    results["scores"] = results["scores"].apply(json.loads)
    results["std_score"] = results["scores"].apply(np.std)
    return results


In [82]:
# Take every third of the first 39 column names and strip its two-character suffix.
[column[:-2] for column in stream_list[0].columns.tolist()[:39:3]]

['FH', 'TH', 'RH', 'RS', 'RO', 'RE', 'RW', 'LS', 'LE', 'LW', 'RA', 'LA', 'VS']

In [72]:

# Rank a saved grid-search run and show only the columns needed to compare settings.
results_csv = "../12. again_with_freq_speed_acc_angles_distances_2021-08-16 15.42.02.csv"
summary_columns = ["model", "window_size", "overlap_rate", "avg_score", "std_score"]
df = std_score(results_csv)
df[summary_columns]

Unnamed: 0,model,window_size,overlap_rate,avg_score,std_score
15,"ExtraTreesClassifier(n_estimators=1200, n_jobs...",4000.0,0.75,0.735882,0.091278
14,"ExtraTreesClassifier(n_estimators=1200, n_jobs...",4000.0,0.5,0.729047,0.105386
10,"ExtraTreesClassifier(n_estimators=600, n_jobs=-1)",4000.0,0.5,0.722271,0.098716
11,"ExtraTreesClassifier(n_estimators=600, n_jobs=-1)",4000.0,0.75,0.711079,0.087443
6,"RandomForestClassifier(n_estimators=1200, n_jo...",4000.0,0.5,0.702202,0.103027
12,"ExtraTreesClassifier(n_estimators=1200, n_jobs...",3000.0,0.5,0.698316,0.079047
8,"ExtraTreesClassifier(n_estimators=600, n_jobs=-1)",3000.0,0.5,0.693013,0.078084
4,"RandomForestClassifier(n_estimators=1200, n_jo...",3000.0,0.5,0.682441,0.075962
3,"RandomForestClassifier(n_estimators=600, n_job...",4000.0,0.75,0.672543,0.069759
0,"RandomForestClassifier(n_estimators=600, n_job...",3000.0,0.5,0.67237,0.085229


# Baseline: Leave-One-Subject-Out Cross-Validation

In [15]:
def segmentation(df, overlap_rate, time_window):
    """Cut a recording into fixed-length, optionally overlapping windows.

    Parameters
    ----------
    df : pandas.DataFrame
        One recording; gaps are filled (interpolate, forward-fill, then 0)
        before windowing.
    overlap_rate : float
        Fraction of ``time_window`` shared by consecutive windows.
    time_window : int
        Number of rows per window.

    Returns
    -------
    list of pandas.DataFrame
        Windows with a fresh 0-based index; empty if the recording is shorter
        than ``time_window``.

    Raises
    ------
    ValueError
        If the overlap leaves a stride of less than one row.
    """
    # Convert the overlap rate into the stride of the sliding window.
    step = int((1 - overlap_rate) * time_window)
    if step < 1:
        raise ValueError(
            f"overlap_rate={overlap_rate} with time_window={time_window} "
            f"gives a step of {step} rows; need at least 1"
        )
    filled = df.interpolate().ffill().fillna(0)
    # Slice by position so the windows do not depend on the frame's index labels.
    return [
        filled.iloc[start:start + time_window].reset_index(drop=True)
        for start in range(0, len(filled) - time_window + 1, step)
    ]

In [16]:
def get_features(x_data):
    """Summarise every column of a window with six time-domain statistics.

    For each column, in column order, the output holds: population standard
    deviation, mean, maximum, minimum, median and variance.

    Parameters
    ----------
    x_data : pandas.DataFrame
        Signal columns of one window.

    Returns
    -------
    list
        Flat list with six values per column.
    """
    features = []
    for name in x_data.columns.tolist():
        column = x_data[name]
        features.extend([
            column.std(ddof=0),
            np.average(column),
            np.max(column),
            np.min(column),
            np.median(column),
            np.var(column),
        ])
    return features

In [17]:
def dataloader(overlap, window_size, verbose=True):
    """Read every training CSV and split each one into windows.

    Parameters
    ----------
    overlap : float
        Overlap rate forwarded to ``segmentation``.
    window_size : int
        Window length (rows) forwarded to ``segmentation``.
    verbose : bool
        Print a status line and show a progress bar.

    Returns
    -------
    data_list : list of pandas.DataFrame
        All windows, concatenated file by file.
    file_lengths : dict
        Maps each subject id (1, 2, 3) to the list of window counts of that
        subject's files, in the order the files were read.
    """
    if verbose:
        print("loading the data...", end="\t")
    data_list = []
    file_lengths = {1: [], 2: [], 3: []}
    paths = glob.glob("../TrainData/*/*/*.csv")
    for file in (tqdm(paths) if verbose else paths):
        segmented_data = segmentation(pd.read_csv(file), overlap, window_size)
        if segmented_data:
            # Look the subject up by column name rather than by position, so a
            # different column order cannot silently select the wrong column.
            person = segmented_data[0]["subject_id"].iloc[0]
            file_lengths[person].append(len(segmented_data))
        data_list.extend(segmented_data)
    return data_list, file_lengths

In [18]:
def feature_extractor(data_list, verbose=True):
    """Turn windows into per-subject feature vectors and labels.

    Parameters
    ----------
    data_list : list of pandas.DataFrame
        Windows carrying ``subject_id`` and ``activity`` columns.
    verbose : bool
        Print a status line and show a progress bar.

    Returns
    -------
    X, y : dict, dict
        Both keyed by subject id (1, 2, 3). ``X`` holds one feature list per
        window, ``y`` the activity found in the window's first row.
    """
    if verbose:
        print(f"extracting the features...", end="  ")
    X = {1: [], 2: [], 3: []}
    y = {1: [], 2: [], 3: []}
    positions = trange(0, len(data_list)) if verbose else range(0, len(data_list))
    for position in positions:
        window = data_list[position]
        person = window.loc[0, "subject_id"]
        # Only the signal columns are summarised.
        signals = window.drop(columns=["subject_id", "activity"])
        X[person].append(get_features(signals))
        y[person].append(window["activity"].iloc[0])
    return X, y

In [19]:
def majority_voting(predictions, file_lengths):
    """Replace each window prediction with the majority label of its source file.

    Parameters
    ----------
    predictions : sequence
        Per-window predicted labels, concatenated file by file in the same
        order as ``file_lengths``.
    file_lengths : iterable of int
        Number of consecutive windows that belong to each file.

    Returns
    -------
    list
        ``length`` copies of the winning label for every file; ties resolve
        to the smallest label.
    """
    filtered_predictions = []
    index = 0
    for length in file_lengths:
        file_pred = np.asarray(predictions[index:index + length])
        # np.unique returns sorted labels, so argmax selects the smallest label
        # among ties. Counting here avoids indexing the result of
        # scipy.stats.mode, whose return shape differs between SciPy releases.
        labels, counts = np.unique(file_pred, return_counts=True)
        majority_choice = labels[np.argmax(counts)]
        filtered_predictions.extend([majority_choice] * length)
        index += length
    return filtered_predictions

In [79]:
def LOOCV_train_evaluate(model, overlap_rate, window_size, voting=True, n_repeats=1, verbose=True):
    """Score a model by holding out each of the three subjects in turn.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is refitted for
        every fold.
    overlap_rate, window_size
        Forwarded to ``dataloader``.
    voting : bool
        Smooth predictions per file with ``majority_voting`` before scoring.
    n_repeats : int
        How many times the three folds are repeated.
    verbose : bool
        Show progress and print the mean score.

    Returns
    -------
    list
        One accuracy per fold, ``3 * n_repeats`` values in total.
    """
    data_list, file_lengths = dataloader(overlap_rate, window_size, verbose=verbose)
    X, y = feature_extractor(data_list, verbose=verbose)
    # (held-out subject, first training subject, second training subject)
    folds = [(1, 2, 3), (2, 3, 1), (3, 1, 2)]
    scores = []
    repeats = trange(n_repeats) if verbose else range(n_repeats)
    for _ in repeats:
        for held_out, train_a, train_b in folds:
            model.fit(X[train_a] + X[train_b], y[train_a] + y[train_b])
            predicted = model.predict(X[held_out])
            if voting:
                predicted = majority_voting(predicted, file_lengths[held_out])
            scores.append(accuracy_score(y[held_out], predicted))
    if verbose:
        print(f"\nMean Score: {np.mean(scores)}")
    return scores

## Grid Search

In [37]:
def GridSearch(parameters, csvpath = "..", filename="gridCV_results", n_repeats=7, verbose=False, progress=True):
    """Evaluate every model / window size / overlap rate combination and save the scores.

    Parameters
    ----------
    parameters : dict
        Must provide the keys ``"model"``, ``"window_size"`` and
        ``"overlap_rate"``, each mapping to a list of candidates.
    csvpath : str
        Directory the result CSV is written to.
    filename : str
        Prefix of the result file; a timestamp is appended.
    n_repeats : int
        Forwarded to ``LOOCV_train_evaluate``.
    verbose : bool
        Forwarded to ``LOOCV_train_evaluate``.
    progress : bool
        Show a progress bar over the combinations.

    Returns
    -------
    None
        The results are written to disk and the path is printed.
    """
    combinations = [
        (model, window_size, overlap_rate)
        for model in parameters["model"]
        for window_size in parameters["window_size"]
        for overlap_rate in parameters["overlap_rate"]
    ]
    if progress:
        combinations = tqdm(combinations)
    records = []
    # Unpack in the order the tuples were built: (model, window_size, overlap_rate).
    # Swapping the last two would pass the window size as the overlap rate.
    for model, window_size, overlap_rate in combinations:
        scores = LOOCV_train_evaluate(model, overlap_rate, window_size, n_repeats=n_repeats, verbose=verbose)
        records.append({
            "model": model.__str__(),
            "window_size": window_size,
            "overlap_rate": overlap_rate,
            "n_repeats": n_repeats,
            "avg_score": np.mean(scores),
            # Plain floats keep the CSV cell a JSON-parsable list.
            "scores": [float(score) for score in scores],
        })
    # Build the frame once; row-wise DataFrame.append is gone in pandas 2.0.
    score_df = pd.DataFrame(
        records,
        columns=["model", "window_size", "overlap_rate", "n_repeats", "avg_score", "scores"],
    )
    # Format the timestamp explicitly instead of slicing str(datetime.now()),
    # which has no fractional part to strip when microseconds are zero.
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    savepath = f"{csvpath}/{filename}_{stamp}.csv".replace(":", ".")
    score_df.to_csv(savepath, index=False)
    print(f"result exported to: {savepath}")


In [None]:
# Search space: random-forest and extra-trees ensembles of three sizes,
# crossed with three window lengths and two overlap rates.
forest_sizes = (300, 600, 1200)
parameters = {
    "model": [estimator(n_trees, n_jobs=-1) for estimator in (RFC, ETC) for n_trees in forest_sizes],
    "window_size": [2000, 3000, 4000],
    "overlap_rate": [0.5, 0.75],
}
GridSearch(parameters)

## Tuning Experiments

In [96]:
# Random forest with 300 trees, 1000-row windows at 50% overlap, 5 repeats.
model = RFC(n_estimators=300, n_jobs=-1)
scores = LOOCV_train_evaluate(model, overlap_rate=0.5, window_size=1000, n_repeats=5)

loading the data...	

100%|██████████| 151/151 [00:10<00:00, 14.32it/s]


extracting the features...  

100%|██████████| 1667/1667 [00:55<00:00, 29.84it/s]
100%|██████████| 5/5 [00:30<00:00,  6.01s/it]



Mean Score: 0.3548648832961903


In [135]:
# Extra-trees with 1500 trees, 4000-row windows at 75% overlap, no per-file voting.
model = ETC(n_estimators=1500, n_jobs=-1)
scores = LOOCV_train_evaluate(model, overlap_rate=0.75, window_size=4000, voting=False, n_repeats=5)

loading the data...	

100%|██████████| 151/151 [00:10<00:00, 14.43it/s]


extracting the features...  

100%|██████████| 414/414 [00:15<00:00, 27.12it/s]
100%|██████████| 5/5 [00:40<00:00,  8.09s/it]


Mean Score: 0.5757194532188168





# LOOCV with frequency domain features

In [10]:
def segmentation(df, overlap_rate, time_window):
    """Cut a recording into fixed-length, optionally overlapping windows.

    Parameters
    ----------
    df : pandas.DataFrame
        One recording; gaps are filled (interpolate, forward-fill, then 0)
        before windowing.
    overlap_rate : float
        Fraction of ``time_window`` shared by consecutive windows.
    time_window : int
        Number of rows per window.

    Returns
    -------
    list of pandas.DataFrame
        Windows with a fresh 0-based index; empty if the recording is shorter
        than ``time_window``.

    Raises
    ------
    ValueError
        If the overlap leaves a stride of less than one row.
    """
    # Convert the overlap rate into the stride of the sliding window.
    step = int((1 - overlap_rate) * time_window)
    if step < 1:
        raise ValueError(
            f"overlap_rate={overlap_rate} with time_window={time_window} "
            f"gives a step of {step} rows; need at least 1"
        )
    filled = df.interpolate().ffill().fillna(0)
    # Slice by position so the windows do not depend on the frame's index labels.
    return [
        filled.iloc[start:start + time_window].reset_index(drop=True)
        for start in range(0, len(filled) - time_window + 1, step)
    ]


def get_features(x_data):
    """Summarise every column of a window in the time and frequency domain.

    For each column, in column order, the output holds 14 values:
    population std, mean, max, min, median and variance of the signal,
    then skewness, kurtosis, population std, mean, max, min, median and
    variance of its squared FFT magnitude.

    Parameters
    ----------
    x_data : pandas.DataFrame
        Signal columns of one window.

    Returns
    -------
    list
        Flat list with 14 values per column.
    """
    features = []
    for name in x_data.columns.tolist():
        column = x_data[name]
        features += [
            column.std(ddof=0),
            np.average(column),
            np.max(column),
            np.min(column),
            np.median(column),
            np.var(column),
        ]
        # Squared magnitude of the discrete Fourier transform of the column.
        power = np.abs(fft(np.array(column))) ** 2
        features += [
            stats.skew(power),
            stats.kurtosis(power),
            power.std(ddof=0),
            np.average(power),
            np.max(power),
            np.min(power),
            np.median(power),
            np.var(power),
        ]
    return features


def dataloader(overlap, window_size, verbose=True):
    """Read every training CSV and split each one into windows.

    Parameters
    ----------
    overlap : float
        Overlap rate forwarded to ``segmentation``.
    window_size : int
        Window length (rows) forwarded to ``segmentation``.
    verbose : bool
        Print a status line and show a progress bar.

    Returns
    -------
    data_list : list of pandas.DataFrame
        All windows, concatenated file by file.
    file_lengths : dict
        Maps each subject id (1, 2, 3) to the list of window counts of that
        subject's files, in the order the files were read.
    """
    if verbose:
        print("loading the data...", end="\t")
    data_list = []
    file_lengths = {1: [], 2: [], 3: []}
    paths = glob.glob("../TrainData/*/*/*.csv")
    for file in (tqdm(paths) if verbose else paths):
        segmented_data = segmentation(pd.read_csv(file), overlap, window_size)
        if segmented_data:
            # Look the subject up by column name rather than by position, so a
            # different column order cannot silently select the wrong column.
            person = segmented_data[0]["subject_id"].iloc[0]
            file_lengths[person].append(len(segmented_data))
        data_list.extend(segmented_data)
    return data_list, file_lengths


def feature_extractor(data_list, verbose=True):
    """Turn windows into per-subject feature vectors and labels.

    Parameters
    ----------
    data_list : list of pandas.DataFrame
        Windows carrying ``subject_id`` and ``activity`` columns.
    verbose : bool
        Print a status line and show a progress bar.

    Returns
    -------
    X, y : dict, dict
        Both keyed by subject id (1, 2, 3). ``X`` holds one feature list per
        window, ``y`` the activity found in the window's first row.
    """
    if verbose:
        print(f"extracting the features...", end="  ")
    X, y = {1: [], 2: [], 3: []}, {1: [], 2: [], 3: []}
    num_range = trange(0, len(data_list)) if verbose else range(0, len(data_list))
    for j in num_range:
        window = data_list[j]
        person = window.loc[0, "subject_id"]
        # Only the signal columns are summarised.
        x_data = window.drop(columns=["subject_id", "activity"])
        X[person].append(get_features(x_data))
        # Read the label by name, not as "last column": the position breaks
        # as soon as extra columns are appended to a window.
        y[person].append(window["activity"].iloc[0])
    return X, y


def majority_voting(predictions, file_lengths):
    """Replace each window prediction with the majority label of its source file.

    Parameters
    ----------
    predictions : sequence
        Per-window predicted labels, concatenated file by file in the same
        order as ``file_lengths``.
    file_lengths : iterable of int
        Number of consecutive windows that belong to each file.

    Returns
    -------
    list
        ``length`` copies of the winning label for every file; ties resolve
        to the smallest label.
    """
    filtered_predictions = []
    index = 0
    for length in file_lengths:
        file_pred = np.asarray(predictions[index:index + length])
        # np.unique returns sorted labels, so argmax selects the smallest label
        # among ties. Counting here avoids indexing the result of
        # scipy.stats.mode, whose return shape differs between SciPy releases.
        labels, counts = np.unique(file_pred, return_counts=True)
        majority_choice = labels[np.argmax(counts)]
        filtered_predictions.extend([majority_choice] * length)
        index += length
    return filtered_predictions

In [11]:
def LOOCV_train_evaluate(model, overlap_rate, window_size, voting=True, n_repeats=1, verbose=True):
    """Score a model by holding out each of the three subjects in turn.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is refitted for
        every fold.
    overlap_rate, window_size
        Forwarded to ``dataloader``.
    voting : bool
        Smooth predictions per file with ``majority_voting`` before scoring.
    n_repeats : int
        How many times the three folds are repeated.
    verbose : bool
        Show progress and print the mean score.

    Returns
    -------
    list
        One accuracy per fold, ``3 * n_repeats`` values in total.
    """
    data_list, file_lengths = dataloader(overlap_rate, window_size, verbose=verbose)
    X, y = feature_extractor(data_list, verbose=verbose)
    # (held-out subject, first training subject, second training subject)
    folds = [(1, 2, 3), (2, 3, 1), (3, 1, 2)]
    scores = []
    repeats = trange(n_repeats) if verbose else range(n_repeats)
    for _ in repeats:
        for held_out, train_a, train_b in folds:
            model.fit(X[train_a] + X[train_b], y[train_a] + y[train_b])
            predicted = model.predict(X[held_out])
            if voting:
                predicted = majority_voting(predicted, file_lengths[held_out])
            scores.append(accuracy_score(y[held_out], predicted))
    if verbose:
        print(f"\nMean Score: {np.mean(scores)}")
    return scores

In [12]:
def GridSearch(parameters, csvpath = "..", n_repeats=7, verbose=False):
    """Evaluate every model / window size / overlap rate combination and save the scores.

    Parameters
    ----------
    parameters : dict
        Must provide the keys ``"model"``, ``"window_size"`` and
        ``"overlap_rate"``, each mapping to a list of candidates.
    csvpath : str
        Directory the result CSV is written to.
    n_repeats : int
        Forwarded to ``LOOCV_train_evaluate``.
    verbose : bool
        Forwarded to ``LOOCV_train_evaluate``.

    Returns
    -------
    None
        The results are written to disk and the path is printed.
    """
    records = []
    for model in parameters["model"]:
        for window_size in parameters["window_size"]:
            for overlap_rate in parameters["overlap_rate"]:
                scores = LOOCV_train_evaluate(model, overlap_rate, window_size, n_repeats=n_repeats, verbose=verbose)
                records.append({
                    "model": model.__str__(),
                    "window_size": window_size,
                    "overlap_rate": overlap_rate,
                    "n_repeats": n_repeats,
                    "avg_score": np.mean(scores),
                    # Plain floats keep the CSV cell a JSON-parsable list.
                    "scores": [float(score) for score in scores],
                })
    # Build the frame once; row-wise DataFrame.append is gone in pandas 2.0.
    score_df = pd.DataFrame(
        records,
        columns=["model", "window_size", "overlap_rate", "n_repeats", "avg_score", "scores"],
    )
    # Format the timestamp explicitly instead of slicing str(datetime.now()),
    # which has no fractional part to strip when microseconds are zero.
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    savepath = f"{csvpath}/grid_search_result_{stamp}.csv".replace(":", ".")
    score_df.to_csv(savepath, index=False)
    print(f"result exported to: {savepath}")


In [None]:
# Search space: random-forest and extra-trees ensembles of three sizes,
# crossed with three window lengths and two overlap rates.
forest_sizes = (300, 600, 1200)
parameters = {
    "model": [estimator(n_trees, n_jobs=-1) for estimator in (RFC, ETC) for n_trees in forest_sizes],
    "window_size": [2000, 3000, 4000],
    "overlap_rate": [0.5, 0.75],
}
GridSearch(parameters)

## Tuning Experiments

In [8]:
# Random forest with 600 trees, 4000-row windows at 75% overlap, 10 repeats.
model = RFC(n_estimators=600, n_jobs=-1)
scores = LOOCV_train_evaluate(model, overlap_rate=0.75, window_size=4000, n_repeats=10)

loading the data...	

100%|██████████| 151/151 [00:09<00:00, 16.46it/s]


extracting the features...  

100%|██████████| 414/414 [00:24<00:00, 16.66it/s]
100%|██████████| 10/10 [00:42<00:00,  4.25s/it]


Mean Score: 0.5870291343610226





In [6]:
# Random forest with 1500 trees, 4000-row windows at 75% overlap, 5 repeats.
model = RFC(n_estimators=1500, n_jobs=-1)
scores = LOOCV_train_evaluate(model, overlap_rate=0.75, window_size=4000, n_repeats=5)

loading the data...	

100%|██████████| 151/151 [00:09<00:00, 15.30it/s]


extracting the features...  

100%|██████████| 414/414 [00:25<00:00, 16.42it/s]
100%|██████████| 5/5 [00:57<00:00, 11.41s/it]


Mean Score: 0.5743854041577472





# LOOCV with speed, acceleration

In [5]:
def segmentation(df, overlap_rate, time_window):
    """Cut a recording into fixed-length, optionally overlapping windows.

    Parameters
    ----------
    df : pandas.DataFrame
        One recording; gaps are filled (interpolate, forward-fill, then 0)
        before windowing.
    overlap_rate : float
        Fraction of ``time_window`` shared by consecutive windows.
    time_window : int
        Number of rows per window.

    Returns
    -------
    list of pandas.DataFrame
        Windows with a fresh 0-based index; empty if the recording is shorter
        than ``time_window``.

    Raises
    ------
    ValueError
        If the overlap leaves a stride of less than one row.
    """
    # Convert the overlap rate into the stride of the sliding window.
    step = int((1 - overlap_rate) * time_window)
    if step < 1:
        raise ValueError(
            f"overlap_rate={overlap_rate} with time_window={time_window} "
            f"gives a step of {step} rows; need at least 1"
        )
    filled = df.interpolate().ffill().fillna(0)
    # Slice by position so the windows do not depend on the frame's index labels.
    return [
        filled.iloc[start:start + time_window].reset_index(drop=True)
        for start in range(0, len(filled) - time_window + 1, step)
    ]

def get_speed_acc(x_data):
    """Compute first and second row-to-row differences of the signal columns.

    Parameters
    ----------
    x_data : pandas.DataFrame
        Window containing ``activity`` and ``subject_id`` columns, which are
        left out of the differencing.

    Returns
    -------
    speed, acc : pandas.DataFrame, pandas.DataFrame
        First differences (columns suffixed ``_speed``) and differences of
        those (suffixed ``_acc``); undefined leading values are set to 0.
    """
    signals = x_data.drop(columns=["activity", "subject_id"])
    speed = signals.diff().fillna(0)
    acc = speed.diff().fillna(0)
    return speed.add_suffix("_speed"), acc.add_suffix("_acc")
    
def get_streams(x_data):
    """Return the window with its speed and acceleration columns appended on the right."""
    return pd.concat([x_data, *get_speed_acc(x_data)], axis=1)

def get_features(x_data):
    """Summarise every column of a window with six time-domain statistics.

    For each column, in column order, the output holds: population standard
    deviation, mean, maximum, minimum, median and variance. Frequency-domain
    features are not computed in this variant.

    Parameters
    ----------
    x_data : pandas.DataFrame
        Signal columns of one window.

    Returns
    -------
    list
        Flat list with six values per column.
    """
    features = []
    for name in x_data.columns.tolist():
        column = x_data[name]
        features.extend([
            column.std(ddof=0),
            np.average(column),
            np.max(column),
            np.min(column),
            np.median(column),
            np.var(column),
        ])
    return features


def dataloader(overlap, window_size, verbose=True):
    """Read every training CSV and split each one into windows.

    Parameters
    ----------
    overlap : float
        Overlap rate forwarded to ``segmentation``.
    window_size : int
        Window length (rows) forwarded to ``segmentation``.
    verbose : bool
        Print a status line and show a progress bar.

    Returns
    -------
    data_list : list of pandas.DataFrame
        All windows, concatenated file by file.
    file_lengths : dict
        Maps each subject id (1, 2, 3) to the list of window counts of that
        subject's files, in the order the files were read.
    """
    if verbose:
        print("loading the data...", end="\t")
    paths = glob.glob("../TrainData/*/*/*.csv")
    if verbose:
        paths = tqdm(paths)
    data_list, file_lengths = [], {1: [], 2: [], 3: []}
    for path in paths:
        windows = segmentation(pd.read_csv(path), overlap, window_size)
        if windows:
            # The subject is taken from the first row of the first window.
            person = windows[0]["subject_id"].iloc[0]
            file_lengths[person].append(len(windows))
        data_list.extend(windows)
    return data_list, file_lengths


def feature_extractor(data_list, verbose=True):
    """Turn windows into per-subject feature vectors and labels.

    Parameters
    ----------
    data_list : list of pandas.DataFrame
        Windows carrying ``subject_id`` and ``activity`` columns.
    verbose : bool
        Print a status line and show a progress bar.

    Returns
    -------
    X, y : dict, dict
        Both keyed by subject id (1, 2, 3). ``X`` holds one feature list per
        window, ``y`` the activity found in the window's first row.
    """
    if verbose:
        print(f"extracting the features...", end="  ")
    X = {1: [], 2: [], 3: []}
    y = {1: [], 2: [], 3: []}
    positions = trange(0, len(data_list)) if verbose else range(0, len(data_list))
    for position in positions:
        window = data_list[position]
        person = window.loc[0, "subject_id"]
        # Only the signal columns are summarised.
        signals = window.drop(columns=["subject_id", "activity"])
        X[person].append(get_features(signals))
        y[person].append(window["activity"].iloc[0])
    return X, y


def majority_voting(predictions, file_lengths):
    """Replace each window prediction with the majority label of its source file.

    Parameters
    ----------
    predictions : sequence
        Per-window predicted labels, concatenated file by file in the same
        order as ``file_lengths``.
    file_lengths : iterable of int
        Number of consecutive windows that belong to each file.

    Returns
    -------
    list
        ``length`` copies of the winning label for every file; ties resolve
        to the smallest label.
    """
    filtered_predictions = []
    index = 0
    for length in file_lengths:
        file_pred = np.asarray(predictions[index:index + length])
        # np.unique returns sorted labels, so argmax selects the smallest label
        # among ties. Counting here avoids indexing the result of
        # scipy.stats.mode, whose return shape differs between SciPy releases.
        labels, counts = np.unique(file_pred, return_counts=True)
        majority_choice = labels[np.argmax(counts)]
        filtered_predictions.extend([majority_choice] * length)
        index += length
    return filtered_predictions

In [6]:
def LOOCV_train_evaluate(model, overlap_rate, window_size, voting=True, n_repeats=1, verbose=True):
    """Score a model on speed/acceleration-augmented windows, holding out each subject in turn.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is refitted for
        every fold.
    overlap_rate, window_size
        Forwarded to ``dataloader``.
    voting : bool
        Smooth predictions per file with ``majority_voting`` before scoring.
    n_repeats : int
        How many times the three folds are repeated.
    verbose : bool
        Show progress and print the mean score.

    Returns
    -------
    list
        One accuracy per fold, ``3 * n_repeats`` values in total.
    """
    data_list, file_lengths = dataloader(overlap_rate, window_size, verbose=verbose)
    # Extend every window with its derived streams before extracting features.
    stream_list = [get_streams(window) for window in data_list]
    X, y = feature_extractor(stream_list, verbose=verbose)
    # (held-out subject, first training subject, second training subject)
    folds = [(1, 2, 3), (2, 3, 1), (3, 1, 2)]
    scores = []
    repeats = trange(n_repeats) if verbose else range(n_repeats)
    for _ in repeats:
        for held_out, train_a, train_b in folds:
            model.fit(X[train_a] + X[train_b], y[train_a] + y[train_b])
            predicted = model.predict(X[held_out])
            if voting:
                predicted = majority_voting(predicted, file_lengths[held_out])
            scores.append(accuracy_score(y[held_out], predicted))
    if verbose:
        print(f"\nMean Score: {np.mean(scores)}")
    return scores

In [7]:
def GridSearch(parameters, csvpath = "..", filename="gridCV_results", n_repeats=7, verbose=False, progress=True):
    """Evaluate every model / window size / overlap rate combination and save the scores.

    Parameters
    ----------
    parameters : dict
        Must provide the keys ``"model"``, ``"window_size"`` and
        ``"overlap_rate"``, each mapping to a list of candidates.
    csvpath : str
        Directory the result CSV is written to.
    filename : str
        Prefix of the result file; a timestamp is appended.
    n_repeats : int
        Forwarded to ``LOOCV_train_evaluate``.
    verbose : bool
        Forwarded to ``LOOCV_train_evaluate``.
    progress : bool
        Show a progress bar over the combinations.

    Returns
    -------
    None
        The results are written to disk and the path is printed.
    """
    combinations = [
        (model, window_size, overlap_rate)
        for model in parameters["model"]
        for window_size in parameters["window_size"]
        for overlap_rate in parameters["overlap_rate"]
    ]
    if progress:
        combinations = tqdm(combinations)
    records = []
    for model, window_size, overlap_rate in combinations:
        scores = LOOCV_train_evaluate(model, overlap_rate, window_size, n_repeats=n_repeats, verbose=verbose)
        records.append({
            "model": model.__str__(),
            "window_size": window_size,
            "overlap_rate": overlap_rate,
            "n_repeats": n_repeats,
            "avg_score": np.mean(scores),
            # Plain floats keep the CSV cell a JSON-parsable list.
            "scores": [float(score) for score in scores],
        })
    # Build the frame once; row-wise DataFrame.append is gone in pandas 2.0.
    score_df = pd.DataFrame(
        records,
        columns=["model", "window_size", "overlap_rate", "n_repeats", "avg_score", "scores"],
    )
    # Format the timestamp explicitly instead of slicing str(datetime.now()),
    # which has no fractional part to strip when microseconds are zero.
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    savepath = f"{csvpath}/{filename}_{stamp}.csv".replace(":", ".")
    score_df.to_csv(savepath, index=False)
    print(f"result exported to: {savepath}")


In [None]:
# Search space: random-forest and extra-trees ensembles of three sizes,
# crossed with two window lengths and two overlap rates.
forest_sizes = (300, 600, 1200)
parameters = {
    "model": [estimator(n_trees, n_jobs=-1) for estimator in (RFC, ETC) for n_trees in forest_sizes],
    "window_size": [3000, 4000],
    "overlap_rate": [0.5, 0.75],
}
GridSearch(parameters)

# LOOCV with frequency, speed, acceleration

In [42]:
def segmentation(df, overlap_rate, time_window):
    """Cut a recording into fixed-length, optionally overlapping windows.

    Parameters
    ----------
    df : pandas.DataFrame
        One recording; gaps are filled (interpolate, forward-fill, then 0)
        before windowing.
    overlap_rate : float
        Fraction of ``time_window`` shared by consecutive windows.
    time_window : int
        Number of rows per window.

    Returns
    -------
    list of pandas.DataFrame
        Windows with a fresh 0-based index; empty if the recording is shorter
        than ``time_window``.

    Raises
    ------
    ValueError
        If the overlap leaves a stride of less than one row.
    """
    # Convert the overlap rate into the stride of the sliding window.
    step = int((1 - overlap_rate) * time_window)
    if step < 1:
        raise ValueError(
            f"overlap_rate={overlap_rate} with time_window={time_window} "
            f"gives a step of {step} rows; need at least 1"
        )
    filled = df.interpolate().ffill().fillna(0)
    # Slice by position so the windows do not depend on the frame's index labels.
    return [
        filled.iloc[start:start + time_window].reset_index(drop=True)
        for start in range(0, len(filled) - time_window + 1, step)
    ]

def get_speed_acc(x_data):
    """Compute first and second row-to-row differences of the signal columns.

    Parameters
    ----------
    x_data : pandas.DataFrame
        Window containing ``activity`` and ``subject_id`` columns, which are
        left out of the differencing.

    Returns
    -------
    speed, acc : pandas.DataFrame, pandas.DataFrame
        First differences (columns suffixed ``_speed``) and differences of
        those (suffixed ``_acc``); undefined leading values are set to 0.
    """
    signals = x_data.drop(columns=["activity", "subject_id"])
    speed = signals.diff().fillna(0)
    acc = speed.diff().fillna(0)
    return speed.add_suffix("_speed"), acc.add_suffix("_acc")

def get_streams(x_data):
    """Return the window with its speed and acceleration columns appended on the right."""
    return pd.concat([x_data, *get_speed_acc(x_data)], axis=1)

def get_features(x_data):
    """Summarise every column of a window with eight statistics.

    For each column, in column order, the output holds: population std,
    mean, max, min, median and variance of the signal, followed by the
    skewness and kurtosis of its squared FFT magnitude.

    Parameters
    ----------
    x_data : pandas.DataFrame
        Signal columns of one window.

    Returns
    -------
    list
        Flat list with eight values per column.
    """
    features = []
    for name in x_data.columns.tolist():
        column = x_data[name]
        # Squared magnitude of the discrete Fourier transform of the column.
        power = np.abs(fft(np.array(column))) ** 2
        features.extend([
            column.std(ddof=0),
            np.average(column),
            np.max(column),
            np.min(column),
            np.median(column),
            np.var(column),
            stats.skew(power),
            stats.kurtosis(power),
        ])
    return features


def dataloader(overlap, window_size, verbose=True):
    """Read every training CSV and split each one into windows.

    Parameters
    ----------
    overlap : float
        Overlap rate forwarded to ``segmentation``.
    window_size : int
        Window length (rows) forwarded to ``segmentation``.
    verbose : bool
        Print a status line and show a progress bar.

    Returns
    -------
    data_list : list of pandas.DataFrame
        All windows, concatenated file by file.
    file_lengths : dict
        Maps each subject id (1, 2, 3) to the list of window counts of that
        subject's files, in the order the files were read.
    """
    if verbose:
        print("loading the data...", end="\t")
    paths = glob.glob("../TrainData/*/*/*.csv")
    if verbose:
        paths = tqdm(paths)
    data_list, file_lengths = [], {1: [], 2: [], 3: []}
    for path in paths:
        windows = segmentation(pd.read_csv(path), overlap, window_size)
        if windows:
            # The subject is taken from the first row of the first window.
            person = windows[0]["subject_id"].iloc[0]
            file_lengths[person].append(len(windows))
        data_list.extend(windows)
    return data_list, file_lengths


def feature_extractor(data_list, verbose=True):
    """Turn windows into per-subject feature vectors and labels.

    Parameters
    ----------
    data_list : list of pandas.DataFrame
        Windows carrying ``subject_id`` and ``activity`` columns.
    verbose : bool
        Print a status line and show a progress bar.

    Returns
    -------
    X, y : dict, dict
        Both keyed by subject id (1, 2, 3). ``X`` holds one feature list per
        window, ``y`` the activity found in the window's first row.
    """
    if verbose:
        print(f"extracting the features...", end="  ")
    X = {1: [], 2: [], 3: []}
    y = {1: [], 2: [], 3: []}
    positions = trange(0, len(data_list)) if verbose else range(0, len(data_list))
    for position in positions:
        window = data_list[position]
        person = window.loc[0, "subject_id"]
        # Only the signal columns are summarised.
        signals = window.drop(columns=["subject_id", "activity"])
        X[person].append(get_features(signals))
        y[person].append(window["activity"].iloc[0])
    return X, y


def majority_voting(predictions, file_lengths):
    """Replace each window prediction with the majority label of its source file.

    Parameters
    ----------
    predictions : sequence
        Per-window predicted labels, concatenated file by file in the same
        order as ``file_lengths``.
    file_lengths : iterable of int
        Number of consecutive windows that belong to each file.

    Returns
    -------
    list
        ``length`` copies of the winning label for every file; ties resolve
        to the smallest label.
    """
    filtered_predictions = []
    index = 0
    for length in file_lengths:
        file_pred = np.asarray(predictions[index:index + length])
        # np.unique returns sorted labels, so argmax selects the smallest label
        # among ties. Counting here avoids indexing the result of
        # scipy.stats.mode, whose return shape differs between SciPy releases.
        labels, counts = np.unique(file_pred, return_counts=True)
        majority_choice = labels[np.argmax(counts)]
        filtered_predictions.extend([majority_choice] * length)
        index += length
    return filtered_predictions

In [43]:
def LOOCV_train_evaluate(model, overlap_rate, window_size, voting=True, n_repeats=1, verbose=True):
    """Score a model on speed/acceleration-augmented windows, holding out each subject in turn.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is refitted for
        every fold.
    overlap_rate, window_size
        Forwarded to ``dataloader``.
    voting : bool
        Smooth predictions per file with ``majority_voting`` before scoring.
    n_repeats : int
        How many times the three folds are repeated.
    verbose : bool
        Show progress and print the mean score.

    Returns
    -------
    list
        One accuracy per fold, ``3 * n_repeats`` values in total.
    """
    data_list, file_lengths = dataloader(overlap_rate, window_size, verbose=verbose)
    # Extend every window with its derived streams before extracting features.
    stream_list = [get_streams(window) for window in data_list]
    X, y = feature_extractor(stream_list, verbose=verbose)
    # (held-out subject, first training subject, second training subject)
    folds = [(1, 2, 3), (2, 3, 1), (3, 1, 2)]
    scores = []
    repeats = trange(n_repeats) if verbose else range(n_repeats)
    for _ in repeats:
        for held_out, train_a, train_b in folds:
            model.fit(X[train_a] + X[train_b], y[train_a] + y[train_b])
            predicted = model.predict(X[held_out])
            if voting:
                predicted = majority_voting(predicted, file_lengths[held_out])
            scores.append(accuracy_score(y[held_out], predicted))
    if verbose:
        print(f"\nMean Score: {np.mean(scores)}")
    return scores

In [44]:
def GridSearch(parameters, csvpath = "..", filename="gridCV_results", n_repeats=7, verbose=False, progress=True):
    """Evaluate every model / window size / overlap rate combination and save the scores.

    Parameters
    ----------
    parameters : dict
        Must provide the keys ``"model"``, ``"window_size"`` and
        ``"overlap_rate"``, each mapping to a list of candidates.
    csvpath : str
        Directory the result CSV is written to.
    filename : str
        Prefix of the result file; a timestamp is appended.
    n_repeats : int
        Forwarded to ``LOOCV_train_evaluate``.
    verbose : bool
        Forwarded to ``LOOCV_train_evaluate``.
    progress : bool
        Show a progress bar over the combinations.

    Returns
    -------
    None
        The results are written to disk and the path is printed.
    """
    combinations = [
        (model, window_size, overlap_rate)
        for model in parameters["model"]
        for window_size in parameters["window_size"]
        for overlap_rate in parameters["overlap_rate"]
    ]
    if progress:
        combinations = tqdm(combinations)
    records = []
    for model, window_size, overlap_rate in combinations:
        scores = LOOCV_train_evaluate(model, overlap_rate, window_size, n_repeats=n_repeats, verbose=verbose)
        records.append({
            "model": model.__str__(),
            "window_size": window_size,
            "overlap_rate": overlap_rate,
            "n_repeats": n_repeats,
            "avg_score": np.mean(scores),
            # Plain floats keep the CSV cell a JSON-parsable list.
            "scores": [float(score) for score in scores],
        })
    # Build the frame once; row-wise DataFrame.append is gone in pandas 2.0.
    score_df = pd.DataFrame(
        records,
        columns=["model", "window_size", "overlap_rate", "n_repeats", "avg_score", "scores"],
    )
    # Format the timestamp explicitly instead of slicing str(datetime.now()),
    # which has no fractional part to strip when microseconds are zero.
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    savepath = f"{csvpath}/{filename}_{stamp}.csv".replace(":", ".")
    score_df.to_csv(savepath, index=False)
    print(f"result exported to: {savepath}")


In [None]:
# Search space: random-forest and extra-trees ensembles of three sizes,
# crossed with three window lengths and two overlap rates.
forest_sizes = (300, 600, 1200)
parameters = {
    "model": [estimator(n_trees, n_jobs=-1) for estimator in (RFC, ETC) for n_trees in forest_sizes],
    "window_size": [2000, 3000, 4000],
    "overlap_rate": [0.5, 0.75],
}
GridSearch(parameters)

In [45]:
# Single hold-out run without voting: train on subjects 2 and 3, test on subject 1.
model = ETC(1200, n_jobs=-1)
data_list, file_lengths = dataloader(0.5, 4000, verbose=True)
# A comprehension keeps the loop variable local, so a notebook-level `df`
# is not overwritten by the last window.
stream_list = [get_streams(window) for window in data_list]
X, y = feature_extractor(stream_list, verbose=True)
p1, p2, p3 = 1, 2, 3
X_test, y_test = X[p1], y[p1]
X_train = X[p2] + X[p3]
y_train = y[p2] + y[p3]
model.fit(X_train, y_train)
pred = model.predict(X_test)
# accuracy_score takes (y_true, y_pred).
accuracy_score(y_test, pred)

loading the data...	

100%|██████████| 151/151 [00:10<00:00, 13.99it/s]


extracting the features...  

100%|██████████| 240/240 [01:03<00:00,  3.79it/s]


0.6962025316455697

In [70]:
# Build one readable name per extracted feature: <stream column>_<statistic>.
cols = stream_list[0].columns.tolist()
cols.remove("activity")
cols.remove("subject_id")
print(len(cols))
# Every suffix carries its own underscore so names read "FH_X_avg" rather
# than "FH_Xavg". The order is the order of the eight statistics that the
# preceding get_features definition appends per column.
stat_suffixes = ["_std", "_avg", "_max", "_min", "_med", "_var", "_skew", "_kurt"]
colnames = [c + suffix for c in cols for suffix in stat_suffixes]
print(len(colnames))

117


100%|██████████| 117/117 [00:00<00:00, 10663.25it/s]

936





# LOOCV with frequency, speed, acceleration, joint distances

In [11]:
def joint_distance(x_data, joint1, joint2):
    """Row-wise Euclidean distance between two joints.

    Parameters
    ----------
    x_data : pandas.DataFrame
        Must contain ``<joint>_X``, ``<joint>_Y`` and ``<joint>_Z`` columns
        for both joints.
    joint1, joint2 : str
        Joint name prefixes.

    Returns
    -------
    pandas.Series
        One distance per row of ``x_data``.
    """
    squared = 0
    for axis in ("X", "Y", "Z"):
        delta = x_data[f"{joint2}_{axis}"] - x_data[f"{joint1}_{axis}"]
        squared = squared + delta ** 2
    return np.sqrt(squared)

def get_all_joint_distances(x_data):
    """Add a ``dist_<A>_<B>`` column for each selected joint pair.

    The frame is modified in place and also returned. Pairs are chosen
    between joints that are not directly connected, because the distance
    between two connected joints (for example left wrist and left elbow)
    stays constant.
    """
    # (joint, joint) pairs; the numbers are the joints' positions in the column layout.
    joint_pairs = [
        ("FH", "LS"),  # front head     -> left shoulder   (1->8)
        ("FH", "RS"),  # front head     -> right shoulder  (1->4)
        ("LS", "LW"),  # left shoulder  -> left wrist      (8->10)
        ("RS", "RW"),  # right shoulder -> right wrist     (4->7)
        ("VS", "LE"),  # v sacral       -> left elbow      (13->9)
        ("VS", "RE"),  # v sacral       -> right elbow     (13->6)
        ("VS", "LW"),  # v sacral       -> left wrist      (13->10)
        ("VS", "RW"),  # v sacral       -> right wrist     (13->7)
        ("VS", "RH"),  # v sacral       -> rear head       (13->3)
        ("VS", "TH"),  # v sacral       -> top head        (13->2)
        ("LW", "RW"),  # left wrist     -> right wrist     (10->7)
        ("LA", "LW"),  # left asis      -> left wrist      (12->10)
        ("RA", "RW"),  # right asis     -> right wrist     (11->7)
        ("LW", "TH"),  # left wrist     -> top head        (10->2)
        ("RW", "TH"),  # right wrist    -> top head        (7->2)
        ("TH", "LA"),  # top head       -> left asis       (2->12)
    ]
    for first, second in joint_pairs:
        x_data[f"dist_{first}_{second}"] = joint_distance(x_data, first, second)
    return x_data

In [12]:
def segmentation(df, overlap_rate, time_window):
    """Cut a recording into fixed-length, optionally overlapping windows.

    Parameters
    ----------
    df : pandas.DataFrame
        One recording; gaps are filled (interpolate, forward-fill, then 0)
        before windowing.
    overlap_rate : float
        Fraction of ``time_window`` shared by consecutive windows.
    time_window : int
        Number of rows per window.

    Returns
    -------
    list of pandas.DataFrame
        Windows with a fresh 0-based index; empty if the recording is shorter
        than ``time_window``.

    Raises
    ------
    ValueError
        If the overlap leaves a stride of less than one row.
    """
    # Convert the overlap rate into the stride of the sliding window.
    step = int((1 - overlap_rate) * time_window)
    if step < 1:
        raise ValueError(
            f"overlap_rate={overlap_rate} with time_window={time_window} "
            f"gives a step of {step} rows; need at least 1"
        )
    filled = df.interpolate().ffill().fillna(0)
    # Slice by position so the windows do not depend on the frame's index labels.
    return [
        filled.iloc[start:start + time_window].reset_index(drop=True)
        for start in range(0, len(filled) - time_window + 1, step)
    ]

def get_speed_acc(x_data):
    """Compute first and second row-to-row differences of the signal columns.

    Parameters
    ----------
    x_data : pandas.DataFrame
        Window containing ``activity`` and ``subject_id`` columns, which are
        left out of the differencing.

    Returns
    -------
    speed, acc : pandas.DataFrame, pandas.DataFrame
        First differences (columns suffixed ``_speed``) and differences of
        those (suffixed ``_acc``); undefined leading values are set to 0.
    """
    signals = x_data.drop(columns=["activity", "subject_id"])
    speed = signals.diff().fillna(0)
    acc = speed.diff().fillna(0)
    return speed.add_suffix("_speed"), acc.add_suffix("_acc")

def get_streams(x_data):
    """Return the window extended with speed, acceleration and joint-distance columns."""
    speed, acc = get_speed_acc(x_data)
    # Distances are added last, so no speed/acceleration is derived from them.
    return get_all_joint_distances(pd.concat([x_data, speed, acc], axis=1))

def get_features(x_data):
    """Summarise every column of a window with eight statistics.

    For each column, in column order, the output holds: population std,
    mean, max, min, median and variance of the signal, followed by the
    skewness and kurtosis of its squared FFT magnitude.

    Parameters
    ----------
    x_data : pandas.DataFrame
        Signal columns of one window.

    Returns
    -------
    list
        Flat list with eight values per column.
    """
    features = []
    for name in x_data.columns.tolist():
        column = x_data[name]
        # Squared magnitude of the discrete Fourier transform of the column.
        power = np.abs(fft(np.array(column))) ** 2
        features.extend([
            column.std(ddof=0),
            np.average(column),
            np.max(column),
            np.min(column),
            np.median(column),
            np.var(column),
            stats.skew(power),
            stats.kurtosis(power),
        ])
    return features

def rename_columns(df):
    """Label the 41 columns of a recording in place and return the same frame.

    The first 39 columns become ``<joint>_X``, ``<joint>_Y``, ``<joint>_Z``
    for the 13 joints in the order listed below; the last two become
    ``subject_id`` and ``activity``.
    """
    # Joint order follows the column layout (joint #1 .. #13).
    joints = ["FH", "TH", "RH", "RS", "RO", "RE", "RW", "LS", "LE", "LW", "RA", "LA", "VS"]
    coordinate_names = [f"{joint}_{axis}" for joint in joints for axis in "XYZ"]
    df.columns = coordinate_names + ["subject_id", "activity"]
    return df

def dataloader(overlap, window_size, verbose=True):
    """Read every training CSV, label its columns and split it into windows.

    Parameters
    ----------
    overlap : float
        Overlap rate forwarded to ``segmentation``.
    window_size : int
        Window length (rows) forwarded to ``segmentation``.
    verbose : bool
        Print a status line and show a progress bar.

    Returns
    -------
    data_list : list of pandas.DataFrame
        All windows, concatenated file by file.
    file_lengths : dict
        Maps each subject id (1, 2, 3) to the list of window counts of that
        subject's files, in the order the files were read.
    """
    if verbose:
        print("loading the data...", end="\t")
    paths = glob.glob("../TrainData/*/*/*.csv")
    if verbose:
        paths = tqdm(paths)
    data_list, file_lengths = [], {1: [], 2: [], 3: []}
    for path in paths:
        recording = rename_columns(pd.read_csv(path))
        windows = segmentation(recording, overlap, window_size)
        if windows:
            # The subject is taken from the first row of the first window.
            person = windows[0]["subject_id"].iloc[0]
            file_lengths[person].append(len(windows))
        data_list.extend(windows)
    return data_list, file_lengths


def feature_extractor(data_list, verbose=True):
    """Turn each window into a feature vector, grouped by subject.

    Args:
        data_list: list of window DataFrames containing ``subject_id`` and
            ``activity`` columns next to the signal columns.
        verbose: when true, print a status line and show a tqdm bar.

    Returns:
        ``(X, y)``: two dicts keyed by subject id (1, 2, 3). ``X`` holds the
        feature vectors from ``get_features``; ``y`` holds the activity
        label taken from the first row of each window.
    """
    if verbose:
        print("extracting the features...", end="  ")
    features_by_subject = {1: [], 2: [], 3: []}
    labels_by_subject = {1: [], 2: [], 3: []}
    positions = trange(0, len(data_list)) if verbose else range(0, len(data_list))
    for position in positions:
        window = data_list[position]
        subject = window.loc[0, "subject_id"]
        # Only the signal columns feed the feature computation.
        signals = window.drop(columns=["subject_id", "activity"])
        features_by_subject[subject].append(get_features(signals))
        labels_by_subject[subject].append(window.reset_index(drop=True).loc[0, "activity"])
    return features_by_subject, labels_by_subject


def majority_voting(predictions, file_lengths):
    """Replace per-window predictions with the majority label of each file.

    ``predictions`` is consumed in consecutive chunks whose sizes are given
    by ``file_lengths``; every chunk is replaced by ``length`` copies of its
    most frequent label. Ties resolve to the smallest label.

    The vote is computed with ``np.unique`` rather than ``scipy.stats.mode``
    because ``mode(...).mode[0]`` fails on SciPy versions where ``mode``
    returns a scalar for 1-D input, and ``mode`` rejects non-numeric labels.

    Args:
        predictions: 1-D sequence of predicted labels.
        file_lengths: iterable with the number of windows per file.

    Returns:
        A list of smoothed labels.
    """
    filtered_predictions = []
    index = 0
    for length in file_lengths:
        file_pred = np.asarray(predictions[index:index + length])
        if file_pred.size:
            # np.unique sorts labels, so argmax picks the smallest one on ties.
            labels, counts = np.unique(file_pred, return_counts=True)
            majority_choice = labels[np.argmax(counts)]
            filtered_predictions.extend([majority_choice] * length)
        index += length
    return filtered_predictions

In [14]:
def LOOCV_train_evaluate(model, overlap_rate, window_size, voting=True, n_repeats=1, verbose=True):
    """Leave-one-subject-out evaluation of ``model`` over subjects 1, 2 and 3.

    Data is loaded and windowed once, expanded with ``get_streams`` and turned
    into features. Each repeat then holds out one subject in turn, fits
    ``model`` on the other two and scores accuracy on the held-out subject.

    Args:
        model: estimator exposing ``fit`` and ``predict``; it is refitted in place.
        overlap_rate: window overlap rate passed to ``dataloader``.
        window_size: window length passed to ``dataloader``.
        voting: when true, predictions are smoothed per file with
            ``majority_voting`` before scoring.
        n_repeats: number of times the three folds are repeated.
        verbose: when true, show progress and print the mean score.

    Returns:
        List of ``3 * n_repeats`` accuracy scores.
    """
    data_list, file_lengths = dataloader(overlap_rate, window_size, verbose=verbose)
    stream_list = [get_streams(window) for window in data_list]
    X, y = feature_extractor(stream_list, verbose=verbose)
    repeats = trange(n_repeats) if verbose else range(n_repeats)
    fold_scores = []
    for _ in repeats:
        for held_out, train_a, train_b in ((1, 2, 3), (2, 3, 1), (3, 1, 2)):
            model.fit(X[train_a] + X[train_b], y[train_a] + y[train_b])
            predicted = model.predict(X[held_out])
            if voting:
                predicted = majority_voting(predicted, file_lengths[held_out])
            fold_scores.append(accuracy_score(y[held_out], predicted))
    if verbose:
        print(f"\nMean Score: {np.mean(fold_scores)}")
    return fold_scores

In [15]:
def GridSearch(parameters, csvpath = "..", filename="gridCV_results", n_repeats=7, verbose=False, progress=True):
    """Evaluate every model / window size / overlap rate combination and save a CSV.

    Args:
        parameters: dict with the keys ``"model"``, ``"window_size"`` and
            ``"overlap_rate"``, each mapping to a list of candidates.
        csvpath: directory the result file is written to.
        filename: file name prefix; a timestamp and ``.csv`` are appended.
        n_repeats: forwarded to ``LOOCV_train_evaluate``.
        verbose: forwarded to ``LOOCV_train_evaluate``.
        progress: when true, wrap the combination loop in a tqdm bar.

    The result rows are collected in a list and turned into a DataFrame once,
    because ``DataFrame.append`` no longer exists in pandas 2.x. The timestamp
    is formatted with ``strftime`` so that only the time separators become
    dots; ``csvpath`` (e.g. a Windows drive letter) is left untouched.
    """
    columns = ["model", "window_size", "overlap_rate", "n_repeats", "avg_score", "scores"]
    combinations = [(model, window_size, overlap_rate)
                    for model in parameters["model"]
                    for window_size in parameters["window_size"]
                    for overlap_rate in parameters["overlap_rate"]]
    if progress:
        combinations = tqdm(combinations)
    records = []
    for model, window_size, overlap_rate in combinations:
        scores = LOOCV_train_evaluate(model, overlap_rate, window_size, n_repeats=n_repeats, verbose=verbose)
        records.append({"model": str(model), "window_size": window_size,
                        "overlap_rate": overlap_rate, "n_repeats": n_repeats,
                        "avg_score": np.mean(scores), "scores": scores})
    # Passing columns keeps the header even when there were no combinations.
    score_df = pd.DataFrame(records, columns=columns)
    timestamp = datetime.now().strftime("%Y-%m-%d %H.%M.%S")
    savepath = f"{csvpath}/{filename}_{timestamp}.csv"
    score_df.to_csv(savepath, index=False)
    print(f"result exported to: {savepath}")


In [16]:
# Search grid: random-forest and extra-trees ensembles with 600 and 1200 trees,
# crossed with two window sizes and two overlap rates.
parameters = {
    "model": [estimator(n_trees, n_jobs=-1)
              for estimator in (RFC, ETC)
              for n_trees in (600, 1200)],
    "window_size": [3000, 4000],
    "overlap_rate": [0.5, 0.75],
}
GridSearch(parameters, filename="again_with_freq_speed_acc_distance")

100%|██████████| 16/16 [50:48<00:00, 190.50s/it]


result exported to: ../again_with_freq_speed_acc_distance_2021-08-16 14.15.59.csv


# LOOCV with fewer features, plus speed, acceleration and distance

In [43]:
def joint_distance(x_data, joint1, joint2):
    """Row-wise Euclidean distance between two joints.

    Reads the ``{joint}_X``, ``{joint}_Y`` and ``{joint}_Z`` columns of
    ``x_data`` for both joints and returns one distance per row.
    """
    squared_sum = 0
    for axis in ("X", "Y", "Z"):
        delta = x_data[f"{joint2}_{axis}"] - x_data[f"{joint1}_{axis}"]
        squared_sum = squared_sum + delta ** 2
    return np.sqrt(squared_sum)

def get_all_joint_distances(x_data):
    """Add one ``dist_<joint1>_<joint2>`` column per joint pair to ``x_data``.

    The frame is modified in place and also returned. Only non-consecutive
    joints are paired: the distance between two consecutive joints (for
    example left wrist and left elbow) is always constant.
    """
    joint_pairs = (
        ("FH", "LS"),   # front head      -> left shoulder   (1->8)
        ("FH", "RS"),   # front head      -> right shoulder  (1->4)
        ("LS", "LW"),   # left shoulder   -> left wrist      (8->10)
        ("RS", "RW"),   # right shoulder  -> right wrist     (4->7)
        ("VS", "LE"),   # v sacral        -> left elbow      (13->9)
        ("VS", "RE"),   # v sacral        -> right elbow     (13->6)
        ("VS", "LW"),   # v sacral        -> left wrist      (13->10)
        ("VS", "RW"),   # v sacral        -> right wrist     (13->7)
        ("VS", "RH"),   # v sacral        -> rear head       (13->3)
        ("VS", "TH"),   # v sacral        -> top head        (13->2)
        ("LW", "RW"),   # left wrist      -> right wrist     (10->7)
        ("LA", "LW"),   # left asis       -> left wrist      (12->10)
        ("RA", "RW"),   # right asis      -> right wrist     (11->7)
        ("LW", "TH"),   # left wrist      -> top head        (10->2)
        ("RW", "TH"),   # right wrist     -> top head        (7->2)
        ("TH", "LA"),   # top head        -> left asis       (2->12)
    )
    for first_joint, second_joint in joint_pairs:
        x_data[f"dist_{first_joint}_{second_joint}"] = joint_distance(x_data, first_joint, second_joint)
    return x_data

In [44]:
def segmentation(df, overlap_rate, time_window):
    """Cut ``df`` into fixed-length sliding windows.

    Missing values are filled first (linear interpolation, then forward
    fill, then zeros for anything left). Windows are taken by row position,
    so the result does not depend on the index labels of ``df``.

    Args:
        df: recording with one row per sample.
        overlap_rate: fraction of a window shared with the next one; the
            step between window starts is ``int((1 - overlap_rate) * time_window)``.
        time_window: number of rows per window.

    Returns:
        List of DataFrames, each ``time_window`` rows long with a fresh
        0-based index. Empty when ``df`` is shorter than one window.

    Raises:
        ValueError: if the resulting step is smaller than one row.
    """
    # Convert the overlap rate into the step of the sliding window.
    step = int((1 - overlap_rate) * time_window)
    if step < 1:
        raise ValueError(
            f"overlap_rate={overlap_rate} with time_window={time_window} gives a step of {step}; "
            "the step must be at least 1"
        )
    filled = df.interpolate().ffill().fillna(0)
    return [
        filled.iloc[start:start + time_window].copy().reset_index(drop=True)
        for start in range(0, len(filled) - time_window + 1, step)
    ]

def get_speed_acc(x_data):
    """First and second row-wise differences of the signal columns.

    ``activity`` and ``subject_id`` are dropped first. The first row of each
    difference (which has no predecessor) is filled with 0.

    Returns:
        ``(speed, acc)``: two DataFrames whose columns are the original
        names suffixed with ``_speed`` and ``_acc`` respectively.
    """
    positions = x_data.drop(columns=["activity", "subject_id"])
    velocity = positions.diff().fillna(0)
    acceleration = velocity.diff().fillna(0)
    return velocity.add_suffix("_speed"), acceleration.add_suffix("_acc")

def get_streams(x_data):
    """Extend a window with speed, acceleration and joint-distance columns.

    The original columns (including ``subject_id`` and ``activity``) are kept;
    the derived columns are appended to a new frame, which is returned.
    """
    velocity, acceleration = get_speed_acc(x_data)
    combined = pd.concat([x_data, velocity, acceleration], axis=1)
    return get_all_joint_distances(combined)

def get_features(x_data):
    """Flatten one window of signals into a reduced feature vector.

    For every column of ``x_data`` (in column order) four values are
    appended: standard deviation (``ddof=0``), maximum, minimum and median.
    Mean, variance and the spectral skew/kurtosis are left out in this variant.

    Returns:
        A flat list holding ``4 * len(x_data.columns)`` values.
    """
    feature_vector = []
    for column_name in x_data.columns.tolist():
        series = x_data[column_name]
        feature_vector.extend([
            series.std(ddof=0),
            np.max(series),
            np.min(series),
            np.median(series),
        ])
    return feature_vector

def rename_columns(df):
    """Assign the canonical marker column names to ``df`` in place.

    The frame is expected to hold 41 columns: X/Y/Z coordinates for the
    13 markers listed below (in that order), followed by ``subject_id``
    and ``activity``. The same ``df`` object is returned.
    """
    markers = (
        "FH", "TH", "RH",   # 1-3
        "RS", "RO", "RE",   # 4-6
        "RW", "LS", "LE",   # 7-9
        "LW", "RA", "LA",   # 10-12
        "VS",               # 13
    )
    coordinate_columns = [f"{marker}_{axis}" for marker in markers for axis in "XYZ"]
    df.columns = coordinate_columns + ["subject_id", "activity"]
    return df

def dataloader(overlap, window_size, verbose=True):
    """Read every training CSV and cut it into sliding windows.

    Args:
        overlap: overlap rate forwarded to ``segmentation``.
        window_size: window length forwarded to ``segmentation``.
        verbose: when true, print a status line and show a tqdm bar.

    Returns:
        ``(data_list, file_lengths)`` where ``data_list`` is the list of all
        windows in file order and ``file_lengths`` maps each subject id
        (1, 2, 3) to the number of windows produced by each of its files.
    """
    if verbose:
        print("loading the data...", end="\t")
    csv_paths = glob.glob("../TrainData/*/*/*.csv")
    if verbose:
        csv_paths = tqdm(csv_paths)
    windows = []
    windows_per_file = {1: [], 2: [], 3: []}
    for csv_path in csv_paths:
        recording = rename_columns(pd.read_csv(csv_path))
        file_windows = segmentation(recording, overlap, window_size)
        if file_windows:
            # The subject id is read from the first row of the first window.
            first_window = file_windows[0].reset_index(drop=True)
            subject = first_window.loc[0, "subject_id"]
            windows_per_file[subject].append(len(file_windows))
        windows.extend(file_windows)
    return windows, windows_per_file


def feature_extractor(data_list, verbose=True):
    """Turn each window into a feature vector, grouped by subject.

    Args:
        data_list: list of window DataFrames containing ``subject_id`` and
            ``activity`` columns next to the signal columns.
        verbose: when true, print a status line and show a tqdm bar.

    Returns:
        ``(X, y)``: two dicts keyed by subject id (1, 2, 3). ``X`` holds the
        feature vectors from ``get_features``; ``y`` holds the activity
        label taken from the first row of each window.
    """
    if verbose:
        print("extracting the features...", end="  ")
    features_by_subject = {1: [], 2: [], 3: []}
    labels_by_subject = {1: [], 2: [], 3: []}
    positions = trange(0, len(data_list)) if verbose else range(0, len(data_list))
    for position in positions:
        window = data_list[position]
        subject = window.loc[0, "subject_id"]
        # Only the signal columns feed the feature computation.
        signals = window.drop(columns=["subject_id", "activity"])
        features_by_subject[subject].append(get_features(signals))
        labels_by_subject[subject].append(window.reset_index(drop=True).loc[0, "activity"])
    return features_by_subject, labels_by_subject


def majority_voting(predictions, file_lengths):
    """Replace per-window predictions with the majority label of each file.

    ``predictions`` is consumed in consecutive chunks whose sizes are given
    by ``file_lengths``; every chunk is replaced by ``length`` copies of its
    most frequent label. Ties resolve to the smallest label.

    The vote is computed with ``np.unique`` rather than ``scipy.stats.mode``
    because ``mode(...).mode[0]`` fails on SciPy versions where ``mode``
    returns a scalar for 1-D input, and ``mode`` rejects non-numeric labels.

    Args:
        predictions: 1-D sequence of predicted labels.
        file_lengths: iterable with the number of windows per file.

    Returns:
        A list of smoothed labels.
    """
    filtered_predictions = []
    index = 0
    for length in file_lengths:
        file_pred = np.asarray(predictions[index:index + length])
        if file_pred.size:
            # np.unique sorts labels, so argmax picks the smallest one on ties.
            labels, counts = np.unique(file_pred, return_counts=True)
            majority_choice = labels[np.argmax(counts)]
            filtered_predictions.extend([majority_choice] * length)
        index += length
    return filtered_predictions

In [45]:
def LOOCV_train_evaluate(model, overlap_rate, window_size, voting=True, n_repeats=1, verbose=True):
    """Leave-one-subject-out evaluation of ``model`` over subjects 1, 2 and 3.

    Data is loaded and windowed once, expanded with ``get_streams`` and turned
    into features. Each repeat then holds out one subject in turn, fits
    ``model`` on the other two and scores accuracy on the held-out subject.

    Args:
        model: estimator exposing ``fit`` and ``predict``; it is refitted in place.
        overlap_rate: window overlap rate passed to ``dataloader``.
        window_size: window length passed to ``dataloader``.
        voting: when true, predictions are smoothed per file with
            ``majority_voting`` before scoring.
        n_repeats: number of times the three folds are repeated.
        verbose: when true, show progress and print the mean score.

    Returns:
        List of ``3 * n_repeats`` accuracy scores.
    """
    data_list, file_lengths = dataloader(overlap_rate, window_size, verbose=verbose)
    stream_list = [get_streams(window) for window in data_list]
    X, y = feature_extractor(stream_list, verbose=verbose)
    repeats = trange(n_repeats) if verbose else range(n_repeats)
    fold_scores = []
    for _ in repeats:
        for held_out, train_a, train_b in ((1, 2, 3), (2, 3, 1), (3, 1, 2)):
            model.fit(X[train_a] + X[train_b], y[train_a] + y[train_b])
            predicted = model.predict(X[held_out])
            if voting:
                predicted = majority_voting(predicted, file_lengths[held_out])
            fold_scores.append(accuracy_score(y[held_out], predicted))
    if verbose:
        print(f"\nMean Score: {np.mean(fold_scores)}")
    return fold_scores

In [46]:
def GridSearch(parameters, csvpath = "..", filename="gridCV_results", n_repeats=7, verbose=False, progress=True):
    """Evaluate every model / window size / overlap rate combination and save a CSV.

    Args:
        parameters: dict with the keys ``"model"``, ``"window_size"`` and
            ``"overlap_rate"``, each mapping to a list of candidates.
        csvpath: directory the result file is written to.
        filename: file name prefix; a timestamp and ``.csv`` are appended.
        n_repeats: forwarded to ``LOOCV_train_evaluate``.
        verbose: forwarded to ``LOOCV_train_evaluate``.
        progress: when true, wrap the combination loop in a tqdm bar.

    The result rows are collected in a list and turned into a DataFrame once,
    because ``DataFrame.append`` no longer exists in pandas 2.x. The timestamp
    is formatted with ``strftime`` so that only the time separators become
    dots; ``csvpath`` (e.g. a Windows drive letter) is left untouched.
    """
    columns = ["model", "window_size", "overlap_rate", "n_repeats", "avg_score", "scores"]
    combinations = [(model, window_size, overlap_rate)
                    for model in parameters["model"]
                    for window_size in parameters["window_size"]
                    for overlap_rate in parameters["overlap_rate"]]
    if progress:
        combinations = tqdm(combinations)
    records = []
    for model, window_size, overlap_rate in combinations:
        scores = LOOCV_train_evaluate(model, overlap_rate, window_size, n_repeats=n_repeats, verbose=verbose)
        records.append({"model": str(model), "window_size": window_size,
                        "overlap_rate": overlap_rate, "n_repeats": n_repeats,
                        "avg_score": np.mean(scores), "scores": scores})
    # Passing columns keeps the header even when there were no combinations.
    score_df = pd.DataFrame(records, columns=columns)
    timestamp = datetime.now().strftime("%Y-%m-%d %H.%M.%S")
    savepath = f"{csvpath}/{filename}_{timestamp}.csv"
    score_df.to_csv(savepath, index=False)
    print(f"result exported to: {savepath}")


In [47]:
# Search grid: random-forest and extra-trees ensembles with 600 and 1200 trees,
# crossed with two window sizes and two overlap rates.
parameters = {
    "model": [estimator(n_trees, n_jobs=-1)
              for estimator in (RFC, ETC)
              for n_trees in (600, 1200)],
    "window_size": [3000, 4000],
    "overlap_rate": [0.5, 0.75],
}
GridSearch(parameters, filename="again_with_less_feat_speed_acc_distance")

100%|██████████| 16/16 [35:55<00:00, 134.70s/it]


result exported to: ../again_with_freq_speed_acc_distance_2021-08-16 16.37.15.csv


# LOOCV with speed, acceleration, distance

In [5]:
def segmentation(df, overlap_rate, time_window):
    """Cut ``df`` into fixed-length sliding windows.

    Missing values are filled first (linear interpolation, then forward
    fill, then zeros for anything left). Windows are taken by row position,
    so the result does not depend on the index labels of ``df``.

    Args:
        df: recording with one row per sample.
        overlap_rate: fraction of a window shared with the next one; the
            step between window starts is ``int((1 - overlap_rate) * time_window)``.
        time_window: number of rows per window.

    Returns:
        List of DataFrames, each ``time_window`` rows long with a fresh
        0-based index. Empty when ``df`` is shorter than one window.

    Raises:
        ValueError: if the resulting step is smaller than one row.
    """
    # Convert the overlap rate into the step of the sliding window.
    step = int((1 - overlap_rate) * time_window)
    if step < 1:
        raise ValueError(
            f"overlap_rate={overlap_rate} with time_window={time_window} gives a step of {step}; "
            "the step must be at least 1"
        )
    filled = df.interpolate().ffill().fillna(0)
    return [
        filled.iloc[start:start + time_window].copy().reset_index(drop=True)
        for start in range(0, len(filled) - time_window + 1, step)
    ]

def get_speed_acc(x_data):
    """First and second row-wise differences of the signal columns.

    ``activity`` and ``subject_id`` are dropped first. The first row of each
    difference (which has no predecessor) is filled with 0.

    Returns:
        ``(speed, acc)``: two DataFrames whose columns are the original
        names suffixed with ``_speed`` and ``_acc`` respectively.
    """
    x_data = x_data.drop(columns=["activity", "subject_id"])
    speed = x_data.diff().fillna(0)
    # Acceleration is the difference of the (still unrenamed) speed frame.
    acc = speed.diff().fillna(0)
    speed.columns = [f"{col}_speed" for col in speed.columns]
    acc.columns = [f"{col}_acc" for col in acc.columns]
    return speed, acc

def get_streams(x_data):
    """Extend a window with speed, acceleration and joint-distance columns.

    The original columns (including ``subject_id`` and ``activity``) are kept;
    the derived columns are appended to a new frame, which is returned.
    """
    velocity, acceleration = get_speed_acc(x_data)
    combined = pd.concat([x_data, velocity, acceleration], axis=1)
    return get_all_joint_distances(combined)

def get_features(x_data):
    """Flatten one window of signals into a time-domain feature vector.

    For every column of ``x_data`` (in column order) six values are
    appended: standard deviation (``ddof=0``), mean, maximum, minimum,
    median and variance. The spectral skew/kurtosis are left out in this variant.

    Returns:
        A flat list holding ``6 * len(x_data.columns)`` values.
    """
    feature_vector = []
    for column_name in x_data.columns.tolist():
        series = x_data[column_name]
        feature_vector.extend([
            series.std(ddof=0),
            np.average(series),
            np.max(series),
            np.min(series),
            np.median(series),
            np.var(series),
        ])
    return feature_vector


def dataloader(overlap, window_size, verbose=True):
    """Read every training CSV and cut it into sliding windows.

    Args:
        overlap: overlap rate forwarded to ``segmentation``.
        window_size: window length forwarded to ``segmentation``.
        verbose: when true, print a status line and show a tqdm bar.

    Returns:
        ``(data_list, file_lengths)`` where ``data_list`` is the list of all
        windows in file order and ``file_lengths`` maps each subject id
        (1, 2, 3) to the number of windows produced by each of its files.
    """
    if verbose:
        print("loading the data...", end="\t")
    csv_paths = glob.glob("../TrainData/*/*/*.csv")
    if verbose:
        csv_paths = tqdm(csv_paths)
    windows = []
    windows_per_file = {1: [], 2: [], 3: []}
    for csv_path in csv_paths:
        recording = rename_columns(pd.read_csv(csv_path))
        file_windows = segmentation(recording, overlap, window_size)
        if file_windows:
            # The subject id is read from the first row of the first window.
            first_window = file_windows[0].reset_index(drop=True)
            subject = first_window.loc[0, "subject_id"]
            windows_per_file[subject].append(len(file_windows))
        windows.extend(file_windows)
    return windows, windows_per_file


def feature_extractor(data_list, verbose=True):
    """Turn each window into a feature vector, grouped by subject.

    Args:
        data_list: list of window DataFrames containing ``subject_id`` and
            ``activity`` columns next to the signal columns.
        verbose: when true, print a status line and show a tqdm bar.

    Returns:
        ``(X, y)``: two dicts keyed by subject id (1, 2, 3). ``X`` holds the
        feature vectors from ``get_features``; ``y`` holds the activity
        label taken from the first row of each window.
    """
    if verbose:
        print("extracting the features...", end="  ")
    features_by_subject = {1: [], 2: [], 3: []}
    labels_by_subject = {1: [], 2: [], 3: []}
    positions = trange(0, len(data_list)) if verbose else range(0, len(data_list))
    for position in positions:
        window = data_list[position]
        subject = window.loc[0, "subject_id"]
        # Only the signal columns feed the feature computation.
        signals = window.drop(columns=["subject_id", "activity"])
        features_by_subject[subject].append(get_features(signals))
        labels_by_subject[subject].append(window.reset_index(drop=True).loc[0, "activity"])
    return features_by_subject, labels_by_subject


def majority_voting(predictions, file_lengths):
    """Replace per-window predictions with the majority label of each file.

    ``predictions`` is consumed in consecutive chunks whose sizes are given
    by ``file_lengths``; every chunk is replaced by ``length`` copies of its
    most frequent label. Ties resolve to the smallest label.

    The vote is computed with ``np.unique`` rather than ``scipy.stats.mode``
    because ``mode(...).mode[0]`` fails on SciPy versions where ``mode``
    returns a scalar for 1-D input, and ``mode`` rejects non-numeric labels.

    Args:
        predictions: 1-D sequence of predicted labels.
        file_lengths: iterable with the number of windows per file.

    Returns:
        A list of smoothed labels.
    """
    filtered_predictions = []
    index = 0
    for length in file_lengths:
        file_pred = np.asarray(predictions[index:index + length])
        if file_pred.size:
            # np.unique sorts labels, so argmax picks the smallest one on ties.
            labels, counts = np.unique(file_pred, return_counts=True)
            majority_choice = labels[np.argmax(counts)]
            filtered_predictions.extend([majority_choice] * length)
        index += length
    return filtered_predictions

In [6]:
def LOOCV_train_evaluate(model, overlap_rate, window_size, voting=True, n_repeats=1, verbose=True):
    """Leave-one-subject-out evaluation of ``model`` over subjects 1, 2 and 3.

    Data is loaded and windowed once, expanded with ``get_streams`` and turned
    into features. Each repeat then holds out one subject in turn, fits
    ``model`` on the other two and scores accuracy on the held-out subject.

    Args:
        model: estimator exposing ``fit`` and ``predict``; it is refitted in place.
        overlap_rate: window overlap rate passed to ``dataloader``.
        window_size: window length passed to ``dataloader``.
        voting: when true, predictions are smoothed per file with
            ``majority_voting`` before scoring.
        n_repeats: number of times the three folds are repeated.
        verbose: when true, show progress and print the mean score.

    Returns:
        List of ``3 * n_repeats`` accuracy scores.
    """
    data_list, file_lengths = dataloader(overlap_rate, window_size, verbose=verbose)
    stream_list = [get_streams(window) for window in data_list]
    X, y = feature_extractor(stream_list, verbose=verbose)
    repeats = trange(n_repeats) if verbose else range(n_repeats)
    fold_scores = []
    for _ in repeats:
        for held_out, train_a, train_b in ((1, 2, 3), (2, 3, 1), (3, 1, 2)):
            model.fit(X[train_a] + X[train_b], y[train_a] + y[train_b])
            predicted = model.predict(X[held_out])
            if voting:
                predicted = majority_voting(predicted, file_lengths[held_out])
            fold_scores.append(accuracy_score(y[held_out], predicted))
    if verbose:
        print(f"\nMean Score: {np.mean(fold_scores)}")
    return fold_scores

In [7]:
def GridSearch(parameters, csvpath = "..", filename="gridCV_results", n_repeats=7, verbose=False, progress=True):
    """Evaluate every model / window size / overlap rate combination and save a CSV.

    Args:
        parameters: dict with the keys ``"model"``, ``"window_size"`` and
            ``"overlap_rate"``, each mapping to a list of candidates.
        csvpath: directory the result file is written to.
        filename: file name prefix; a timestamp and ``.csv`` are appended.
        n_repeats: forwarded to ``LOOCV_train_evaluate``.
        verbose: forwarded to ``LOOCV_train_evaluate``.
        progress: when true, wrap the combination loop in a tqdm bar.

    The result rows are collected in a list and turned into a DataFrame once,
    because ``DataFrame.append`` no longer exists in pandas 2.x. The timestamp
    is formatted with ``strftime`` so that only the time separators become
    dots; ``csvpath`` (e.g. a Windows drive letter) is left untouched.
    """
    columns = ["model", "window_size", "overlap_rate", "n_repeats", "avg_score", "scores"]
    combinations = [(model, window_size, overlap_rate)
                    for model in parameters["model"]
                    for window_size in parameters["window_size"]
                    for overlap_rate in parameters["overlap_rate"]]
    if progress:
        combinations = tqdm(combinations)
    records = []
    for model, window_size, overlap_rate in combinations:
        scores = LOOCV_train_evaluate(model, overlap_rate, window_size, n_repeats=n_repeats, verbose=verbose)
        records.append({"model": str(model), "window_size": window_size,
                        "overlap_rate": overlap_rate, "n_repeats": n_repeats,
                        "avg_score": np.mean(scores), "scores": scores})
    # Passing columns keeps the header even when there were no combinations.
    score_df = pd.DataFrame(records, columns=columns)
    timestamp = datetime.now().strftime("%Y-%m-%d %H.%M.%S")
    savepath = f"{csvpath}/{filename}_{timestamp}.csv"
    score_df.to_csv(savepath, index=False)
    print(f"result exported to: {savepath}")


In [None]:
# Search grid: random-forest and extra-trees ensembles with 300, 600 and 1200
# trees, crossed with two window sizes and two overlap rates.
parameters = {
    "model": [estimator(n_trees, n_jobs=-1)
              for estimator in (RFC, ETC)
              for n_trees in (300, 600, 1200)],
    "window_size": [3000, 4000],
    "overlap_rate": [0.5, 0.75],
}
GridSearch(parameters)

# LOOCV with frequency, speed, acceleration and angles 

In [19]:
def joint_angle(x_data, joint1, joint2, joint3):
    """Row-wise angle (radians) between the segments joint1->joint2 and joint2->joint3.

    Both direction vectors are normalised and the angle is the arccosine of
    their dot product, so a straight chain gives 0 and a right-angle bend
    gives pi/2.

    The cosine is clipped to [-1, 1] before ``arccos``: rounding can push the
    dot product of two (nearly) parallel unit vectors slightly outside that
    range, which would otherwise produce NaN. Rows where a segment has zero
    length still yield NaN, because the direction is undefined there.

    Returns:
        1-D ``numpy`` array with one angle per row of ``x_data``.
    """
    x1, y1, z1 = x_data[f"{joint1}_X"], x_data[f"{joint1}_Y"], x_data[f"{joint1}_Z"]
    x2, y2, z2 = x_data[f"{joint2}_X"], x_data[f"{joint2}_Y"], x_data[f"{joint2}_Z"]
    x3, y3, z3 = x_data[f"{joint3}_X"], x_data[f"{joint3}_Y"], x_data[f"{joint3}_Z"]
    v1 = np.array([x2-x1, y2-y1, z2-z1]).T
    v2 = np.array([x3-x2, y3-y2, z3-z2]).T
    v1_unit = v1/np.linalg.norm(v1, axis=1, keepdims=True)
    v2_unit = v2/np.linalg.norm(v2, axis=1, keepdims=True)
    cosine = np.clip(np.sum(v1_unit*v2_unit, axis=1), -1.0, 1.0)  # row-wise dot product
    angle = np.arccos(cosine)
    return angle

def get_all_joint_angles(x_data):
    """Add one ``angle_<joint1>_<joint2>_<joint3>`` column per joint triple.

    ``x_data`` is modified in place and also returned. Each column holds the
    value of ``joint_angle`` for the triple.
    """
    joint_triples = (
        ("LS", "LE", "LW"),   # left shoulder  -> left elbow     -> left wrist   (8->9->10)
        ("RS", "RE", "RW"),   # right shoulder -> right elbow    -> right wrist  (4->6->7)
        ("RS", "LS", "FH"),   # right shoulder -> left shoulder  -> front head   (4->8->1)
        ("RS", "LS", "LE"),   # right shoulder -> left shoulder  -> left elbow   (4->8->9)
        ("LS", "RS", "RE"),   # left shoulder  -> right shoulder -> right elbow  (8->4->6)
        ("VS", "RO", "RH"),   # v sacral       -> right offset   -> rear head    (13->5->3)
        ("VS", "TH", "FH"),   # v sacral       -> top head       -> front head   (13->2->1)
        ("VS", "LS", "LE"),   # v sacral       -> left shoulder  -> left elbow   (13->8->9)
        ("VS", "RS", "RE"),   # v sacral       -> right shoulder -> right elbow  (13->4->6)
        ("LA", "LS", "LE"),   # left asis      -> left shoulder  -> left elbow   (12->8->9)
        ("RA", "RS", "RE"),   # right asis     -> right shoulder -> right elbow  (11->4->6)
    )
    for first, middle, last in joint_triples:
        x_data[f"angle_{first}_{middle}_{last}"] = joint_angle(x_data, first, middle, last)
    return x_data

In [20]:
def segmentation(df, overlap_rate, time_window):
    """Cut ``df`` into fixed-length sliding windows.

    Missing values are filled first (linear interpolation, then forward
    fill, then zeros for anything left). Windows are taken by row position,
    so the result does not depend on the index labels of ``df``.

    Args:
        df: recording with one row per sample.
        overlap_rate: fraction of a window shared with the next one; the
            step between window starts is ``int((1 - overlap_rate) * time_window)``.
        time_window: number of rows per window.

    Returns:
        List of DataFrames, each ``time_window`` rows long with a fresh
        0-based index. Empty when ``df`` is shorter than one window.

    Raises:
        ValueError: if the resulting step is smaller than one row.
    """
    # Convert the overlap rate into the step of the sliding window.
    step = int((1 - overlap_rate) * time_window)
    if step < 1:
        raise ValueError(
            f"overlap_rate={overlap_rate} with time_window={time_window} gives a step of {step}; "
            "the step must be at least 1"
        )
    filled = df.interpolate().ffill().fillna(0)
    return [
        filled.iloc[start:start + time_window].copy().reset_index(drop=True)
        for start in range(0, len(filled) - time_window + 1, step)
    ]

def get_speed_acc(x_data):
    """First and second row-wise differences of the signal columns.

    ``activity`` and ``subject_id`` are dropped first. The first row of each
    difference (which has no predecessor) is filled with 0.

    Returns:
        ``(speed, acc)``: two DataFrames whose columns are the original
        names suffixed with ``_speed`` and ``_acc`` respectively.
    """
    positions = x_data.drop(columns=["activity", "subject_id"])
    velocity = positions.diff().fillna(0)
    acceleration = velocity.diff().fillna(0)
    return velocity.add_suffix("_speed"), acceleration.add_suffix("_acc")

def get_streams(x_data):
    """Extend a window with speed, acceleration and joint-angle columns.

    The original columns (including ``subject_id`` and ``activity``) are kept;
    the derived columns are appended to a new frame, which is returned.
    """
    velocity, acceleration = get_speed_acc(x_data)
    combined = pd.concat([x_data, velocity, acceleration], axis=1)
    return get_all_joint_angles(combined)

def get_features(x_data):
    """Flatten one window of signals into a single feature vector.

    For every column of ``x_data`` (in column order) eight values are
    appended: standard deviation (``ddof=0``), mean, maximum, minimum,
    median, variance, and the skewness and kurtosis of the column's
    FFT power spectrum.

    Returns:
        A flat list holding ``8 * len(x_data.columns)`` values.
    """
    feature_vector = []
    for column_name in x_data.columns.tolist():
        series = x_data[column_name]
        # Squared FFT magnitude, i.e. the power spectrum of this column.
        power_spectrum = np.abs(fft(np.array(series))) ** 2
        feature_vector.extend([
            series.std(ddof=0),
            np.average(series),
            np.max(series),
            np.min(series),
            np.median(series),
            np.var(series),
            stats.skew(power_spectrum),
            stats.kurtosis(power_spectrum),
        ])
    return feature_vector

def rename_columns(df):
    """Assign the canonical marker column names to ``df`` in place.

    The frame is expected to hold 41 columns: X/Y/Z coordinates for the
    13 markers listed below (in that order), followed by ``subject_id``
    and ``activity``. The same ``df`` object is returned.
    """
    markers = (
        "FH", "TH", "RH",   # 1-3
        "RS", "RO", "RE",   # 4-6
        "RW", "LS", "LE",   # 7-9
        "LW", "RA", "LA",   # 10-12
        "VS",               # 13
    )
    coordinate_columns = [f"{marker}_{axis}" for marker in markers for axis in "XYZ"]
    df.columns = coordinate_columns + ["subject_id", "activity"]
    return df

def dataloader(overlap, window_size, verbose=True):
    """Read every training CSV and cut it into sliding windows.

    Args:
        overlap: overlap rate forwarded to ``segmentation``.
        window_size: window length forwarded to ``segmentation``.
        verbose: when true, print a status line and show a tqdm bar.

    Returns:
        ``(data_list, file_lengths)`` where ``data_list`` is the list of all
        windows in file order and ``file_lengths`` maps each subject id
        (1, 2, 3) to the number of windows produced by each of its files.
    """
    if verbose:
        print("loading the data...", end="\t")
    csv_paths = glob.glob("../TrainData/*/*/*.csv")
    if verbose:
        csv_paths = tqdm(csv_paths)
    windows = []
    windows_per_file = {1: [], 2: [], 3: []}
    for csv_path in csv_paths:
        recording = rename_columns(pd.read_csv(csv_path))
        file_windows = segmentation(recording, overlap, window_size)
        if file_windows:
            # The subject id is read from the first row of the first window.
            first_window = file_windows[0].reset_index(drop=True)
            subject = first_window.loc[0, "subject_id"]
            windows_per_file[subject].append(len(file_windows))
        windows.extend(file_windows)
    return windows, windows_per_file


def feature_extractor(data_list, verbose=True):
    """Turn each window into a feature vector, grouped by subject.

    Args:
        data_list: list of window DataFrames containing ``subject_id`` and
            ``activity`` columns next to the signal columns.
        verbose: when true, print a status line and show a tqdm bar.

    Returns:
        ``(X, y)``: two dicts keyed by subject id (1, 2, 3). ``X`` holds the
        feature vectors from ``get_features``; ``y`` holds the activity
        label taken from the first row of each window.
    """
    if verbose:
        print("extracting the features...", end="  ")
    features_by_subject = {1: [], 2: [], 3: []}
    labels_by_subject = {1: [], 2: [], 3: []}
    positions = trange(0, len(data_list)) if verbose else range(0, len(data_list))
    for position in positions:
        window = data_list[position]
        subject = window.loc[0, "subject_id"]
        # Only the signal columns feed the feature computation.
        signals = window.drop(columns=["subject_id", "activity"])
        features_by_subject[subject].append(get_features(signals))
        labels_by_subject[subject].append(window.reset_index(drop=True).loc[0, "activity"])
    return features_by_subject, labels_by_subject


def majority_voting(predictions, file_lengths):
    """Replace per-window predictions with the majority label of each file.

    ``predictions`` is consumed in consecutive chunks whose sizes are given
    by ``file_lengths``; every chunk is replaced by ``length`` copies of its
    most frequent label. Ties resolve to the smallest label.

    The vote is computed with ``np.unique`` rather than ``scipy.stats.mode``
    because ``mode(...).mode[0]`` fails on SciPy versions where ``mode``
    returns a scalar for 1-D input, and ``mode`` rejects non-numeric labels.

    Args:
        predictions: 1-D sequence of predicted labels.
        file_lengths: iterable with the number of windows per file.

    Returns:
        A list of smoothed labels.
    """
    filtered_predictions = []
    index = 0
    for length in file_lengths:
        file_pred = np.asarray(predictions[index:index + length])
        if file_pred.size:
            # np.unique sorts labels, so argmax picks the smallest one on ties.
            labels, counts = np.unique(file_pred, return_counts=True)
            majority_choice = labels[np.argmax(counts)]
            filtered_predictions.extend([majority_choice] * length)
        index += length
    return filtered_predictions

In [21]:
def LOOCV_train_evaluate(model, overlap_rate, window_size, voting=True, n_repeats=1, verbose=True):
    """Leave-one-subject-out evaluation of ``model`` over subjects 1, 2 and 3.

    Data is loaded and windowed once, expanded with ``get_streams`` and turned
    into features. Each repeat then holds out one subject in turn, fits
    ``model`` on the other two and scores accuracy on the held-out subject.

    Args:
        model: estimator exposing ``fit`` and ``predict``; it is refitted in place.
        overlap_rate: window overlap rate passed to ``dataloader``.
        window_size: window length passed to ``dataloader``.
        voting: when true, predictions are smoothed per file with
            ``majority_voting`` before scoring.
        n_repeats: number of times the three folds are repeated.
        verbose: when true, show progress and print the mean score.

    Returns:
        List of ``3 * n_repeats`` accuracy scores.
    """
    data_list, file_lengths = dataloader(overlap_rate, window_size, verbose=verbose)
    stream_list = [get_streams(window) for window in data_list]
    X, y = feature_extractor(stream_list, verbose=verbose)
    repeats = trange(n_repeats) if verbose else range(n_repeats)
    fold_scores = []
    for _ in repeats:
        for held_out, train_a, train_b in ((1, 2, 3), (2, 3, 1), (3, 1, 2)):
            model.fit(X[train_a] + X[train_b], y[train_a] + y[train_b])
            predicted = model.predict(X[held_out])
            if voting:
                predicted = majority_voting(predicted, file_lengths[held_out])
            fold_scores.append(accuracy_score(y[held_out], predicted))
    if verbose:
        print(f"\nMean Score: {np.mean(fold_scores)}")
    return fold_scores

In [22]:
def GridSearch(parameters, csvpath = "..", filename="gridCV_results", n_repeats=7, verbose=False, progress=True):
    """Evaluate every model / window size / overlap rate combination and save a CSV.

    Args:
        parameters: dict with the keys ``"model"``, ``"window_size"`` and
            ``"overlap_rate"``, each mapping to a list of candidates.
        csvpath: directory the result file is written to.
        filename: file name prefix; a timestamp and ``.csv`` are appended.
        n_repeats: forwarded to ``LOOCV_train_evaluate``.
        verbose: forwarded to ``LOOCV_train_evaluate``.
        progress: when true, wrap the combination loop in a tqdm bar.

    The result rows are collected in a list and turned into a DataFrame once,
    because ``DataFrame.append`` no longer exists in pandas 2.x. The timestamp
    is formatted with ``strftime`` so that only the time separators become
    dots; ``csvpath`` (e.g. a Windows drive letter) is left untouched.
    """
    columns = ["model", "window_size", "overlap_rate", "n_repeats", "avg_score", "scores"]
    combinations = [(model, window_size, overlap_rate)
                    for model in parameters["model"]
                    for window_size in parameters["window_size"]
                    for overlap_rate in parameters["overlap_rate"]]
    if progress:
        combinations = tqdm(combinations)
    records = []
    for model, window_size, overlap_rate in combinations:
        scores = LOOCV_train_evaluate(model, overlap_rate, window_size, n_repeats=n_repeats, verbose=verbose)
        records.append({"model": str(model), "window_size": window_size,
                        "overlap_rate": overlap_rate, "n_repeats": n_repeats,
                        "avg_score": np.mean(scores), "scores": scores})
    # Passing columns keeps the header even when there were no combinations.
    score_df = pd.DataFrame(records, columns=columns)
    timestamp = datetime.now().strftime("%Y-%m-%d %H.%M.%S")
    savepath = f"{csvpath}/{filename}_{timestamp}.csv"
    score_df.to_csv(savepath, index=False)
    print(f"result exported to: {savepath}")


In [11]:
# Search grid: random-forest and extra-trees ensembles with 600 and 1200 trees,
# crossed with two window sizes and two overlap rates.
parameters = {
    "model": [estimator(n_trees, n_jobs=-1)
              for estimator in (RFC, ETC)
              for n_trees in (600, 1200)],
    "window_size": [3000, 4000],
    "overlap_rate": [0.5, 0.75],
}
GridSearch(parameters, filename="again_with_freq_speed_acc_angles")

100%|██████████| 20/20 [1:17:55<00:00, 233.78s/it]


result exported to: ../gridCV_results_2021-08-14 19.01.56.csv


# LOOCV with frequency, speed, acceleration, distance, angle

In [29]:
def joint_distance(x_data, joint1, joint2):
    """Row-wise Euclidean distance between two joints.

    Reads the ``{joint}_X``, ``{joint}_Y`` and ``{joint}_Z`` columns of
    ``x_data`` for both joints and returns one distance per row.
    """
    squared_sum = 0
    for axis in ("X", "Y", "Z"):
        delta = x_data[f"{joint2}_{axis}"] - x_data[f"{joint1}_{axis}"]
        squared_sum = squared_sum + delta ** 2
    return np.sqrt(squared_sum)

def get_all_joint_distances(x_data):
    """Add one ``dist_<joint1>_<joint2>`` column per joint pair to ``x_data``.

    The frame is modified in place and also returned. Only non-consecutive
    joints are paired: the distance between two consecutive joints (for
    example left wrist and left elbow) is always constant.
    """
    joint_pairs = (
        ("FH", "LS"),   # front head      -> left shoulder   (1->8)
        ("FH", "RS"),   # front head      -> right shoulder  (1->4)
        ("LS", "LW"),   # left shoulder   -> left wrist      (8->10)
        ("RS", "RW"),   # right shoulder  -> right wrist     (4->7)
        ("VS", "LE"),   # v sacral        -> left elbow      (13->9)
        ("VS", "RE"),   # v sacral        -> right elbow     (13->6)
        ("VS", "LW"),   # v sacral        -> left wrist      (13->10)
        ("VS", "RW"),   # v sacral        -> right wrist     (13->7)
        ("VS", "RH"),   # v sacral        -> rear head       (13->3)
        ("VS", "TH"),   # v sacral        -> top head        (13->2)
        ("LW", "RW"),   # left wrist      -> right wrist     (10->7)
        ("LA", "LW"),   # left asis       -> left wrist      (12->10)
        ("RA", "RW"),   # right asis      -> right wrist     (11->7)
        ("LW", "TH"),   # left wrist      -> top head        (10->2)
        ("RW", "TH"),   # right wrist     -> top head        (7->2)
        ("TH", "LA"),   # top head        -> left asis       (2->12)
    )
    for first_joint, second_joint in joint_pairs:
        x_data[f"dist_{first_joint}_{second_joint}"] = joint_distance(x_data, first_joint, second_joint)
    return x_data

In [30]:
def joint_angle(x_data, joint1, joint2, joint3):
    """Row-wise angle (radians) between the segments joint1->joint2 and joint2->joint3.

    Both direction vectors are normalised and the angle is the arccosine of
    their dot product, so a straight chain gives 0 and a right-angle bend
    gives pi/2.

    The cosine is clipped to [-1, 1] before ``arccos``: rounding can push the
    dot product of two (nearly) parallel unit vectors slightly outside that
    range, which would otherwise produce NaN. Rows where a segment has zero
    length still yield NaN, because the direction is undefined there.

    Returns:
        1-D ``numpy`` array with one angle per row of ``x_data``.
    """
    x1, y1, z1 = x_data[f"{joint1}_X"], x_data[f"{joint1}_Y"], x_data[f"{joint1}_Z"]
    x2, y2, z2 = x_data[f"{joint2}_X"], x_data[f"{joint2}_Y"], x_data[f"{joint2}_Z"]
    x3, y3, z3 = x_data[f"{joint3}_X"], x_data[f"{joint3}_Y"], x_data[f"{joint3}_Z"]
    v1 = np.array([x2-x1, y2-y1, z2-z1]).T
    v2 = np.array([x3-x2, y3-y2, z3-z2]).T
    v1_unit = v1/np.linalg.norm(v1, axis=1, keepdims=True)
    v2_unit = v2/np.linalg.norm(v2, axis=1, keepdims=True)
    cosine = np.clip(np.sum(v1_unit*v2_unit, axis=1), -1.0, 1.0)  # row-wise dot product
    angle = np.arccos(cosine)
    return angle

def get_all_joint_angles(x_data):
    """
    adds one `angle_<joint1>_<joint2>_<joint3>` column to `x_data` for every
    joint triplet listed below (each computed by `joint_angle`) and returns
    the same, modified `x_data`.
    """
    # marker numbers: FH=1 TH=2 RH=3 RS=4 RO=5 RE=6 RW=7 LS=8 LE=9 LW=10 RA=11 LA=12 VS=13
    triplets = [
        ("LS", "LE", "LW"),     # left shoulder   ->  left elbow      ->  left wrist    (8->9->10)
        ("RS", "RE", "RW"),     # right shoulder  ->  right elbow     ->  right wrist   (4->6->7)
        ("RS", "LS", "FH"),     # right shoulder  ->  left shoulder   ->  front head    (4->8->1)
        ("RS", "LS", "LE"),     # right shoulder  ->  left shoulder   ->  left elbow    (4->8->9)
        ("LS", "RS", "RE"),     # left shoulder   ->  right shoulder  ->  right elbow   (8->4->6)
        ("VS", "RO", "RH"),     # v sacral        ->  right offset    ->  rear head     (13->5->3)
        ("VS", "TH", "FH"),     # v sacral        ->  top head        ->  front head    (13->2->1)
        ("VS", "LS", "LE"),     # v sacral        ->  left shoulder   ->  left elbow    (13->8->9)
        ("VS", "RS", "RE"),     # v sacral        ->  right shoulder  ->  right elbow   (13->4->6)
        ("LA", "LS", "LE"),     # left asis       ->  left shoulder   ->  left elbow    (12->8->9)
        ("RA", "RS", "RE"),     # right asis      ->  right shoulder  ->  right elbow   (11->4->6)
    ]
    for first, middle, last in triplets:
        x_data[f"angle_{first}_{middle}_{last}"] = joint_angle(x_data, first, middle, last)
    return x_data

In [31]:
def segmentation(df, overlap_rate, time_window):
    """
    cuts `df` into fixed-length, possibly overlapping windows.

    Missing values are filled before cutting: linear interpolation first,
    then forward fill, and 0 for whatever is still missing.

    df           : DataFrame with one row per sample
    overlap_rate : fraction of a window shared with the next window
                   (0.5 -> each window starts half a window after the previous one)
    time_window  : number of rows in each window

    returns a list of DataFrames, each with exactly `time_window` rows and a
    fresh 0..time_window-1 index. Trailing rows that do not fill a whole
    window are dropped; a `df` shorter than one window gives an empty list.
    raises ValueError when the step between windows works out to 0.
    """
    # step of the sliding window; rounding before truncating removes floating
    # point noise, e.g. (1 - 0.9) * 10 evaluates to 0.9999999999999998
    step = int(round((1 - overlap_rate) * time_window, 6))
    if step == 0:
        raise ValueError(
            f"overlap_rate={overlap_rate} with time_window={time_window} gives a step of 0 rows"
        )
    # interpolate
    df = df.interpolate().ffill().fillna(0)
    # positional slicing, so the windows do not depend on the index labels of `df`
    return [
        df.iloc[start:start + time_window].copy().reset_index(drop=True)
        for start in range(0, len(df) - time_window + 1, step)
    ]

def get_speed_acc(x_data):
    """
    returns two DataFrames computed from every column of `x_data` except
    `activity` and `subject_id`:
    the row-to-row difference of each column (columns suffixed `_speed`) and
    the row-to-row difference of that difference (columns suffixed `_acc`).
    Entries without a predecessor are set to 0. The differences are per row;
    they are not divided by a sampling interval.
    """
    coords = x_data.drop(columns=["activity", "subject_id"])
    first_diff = coords.diff().fillna(0)
    second_diff = first_diff.diff().fillna(0)
    return first_diff.add_suffix("_speed"), second_diff.add_suffix("_acc")

def get_streams(x_data):
    """
    returns a new DataFrame holding the columns of `x_data` followed by the
    `_speed` and `_acc` columns from `get_speed_acc`, the joint distance
    columns and the joint angle columns.
    """
    velocity, acceleration = get_speed_acc(x_data)
    streams = pd.concat([x_data, velocity, acceleration], axis=1)
    # distances first, angles second: this fixes the column (and feature) order
    return get_all_joint_angles(get_all_joint_distances(streams))

def get_features(x_data):
    """
    flattens one window into a feature vector.

    For every column of `x_data`, in column order, eight values are emitted:
    population standard deviation, mean, maximum, minimum, median,
    population variance, and the skewness and kurtosis of the column's FFT
    power spectrum. Returns a flat list of 8 * n_columns values.
    """
    features = []
    for name in x_data.columns.tolist():
        column = x_data[name]
        # squared magnitude of the FFT of the raw samples
        power_spectrum = np.abs(fft(np.array(column)))**2
        features.extend([
            column.std(ddof=0),
            np.average(column),
            np.max(column),
            np.min(column),
            np.median(column),
            np.var(column),
            stats.skew(power_spectrum),
            stats.kurtosis(power_spectrum),
        ])
    return features

def rename_columns(df):
    """
    replaces the column labels of `df` in place and returns `df`.

    The new labels are `<joint>_X`, `<joint>_Y`, `<joint>_Z` for the 13 joints
    in the order listed below, followed by `subject_id` and `activity`,
    so `df` must have exactly 41 columns.
    """
    # position in this list + 1 is the marker number used in the comments of
    # the distance / angle helpers (FH=1 ... VS=13)
    joints = ["FH", "TH", "RH", "RS", "RO", "RE", "RW", "LS", "LE", "LW", "RA", "LA", "VS"]
    coordinate_columns = [f"{joint}_{axis}" for joint in joints for axis in "XYZ"]
    df.columns = coordinate_columns + ["subject_id", "activity"]
    return df

def dataloader(overlap, window_size, verbose=True):
    """
    reads every csv matching `../TrainData/*/*/*.csv`, renames its columns
    with `rename_columns` and cuts it into windows with `segmentation`.

    returns `(data_list, file_lengths)`:
    data_list    : all windows of all files, in the order the files were read
    file_lengths : {subject_id: [number of windows of each file of that subject]},
                   in the same file order; only subjects 1, 2 and 3 are expected
    A file too short for a single window contributes nothing to either.
    """
    if verbose:
        print("loading the data...", end="\t")
    paths = glob.glob("../TrainData/*/*/*.csv")
    if verbose:
        paths = tqdm(paths)
    data_list = []
    file_lengths = {1: [], 2: [], 3: []}
    for path in paths:
        file_windows = segmentation(rename_columns(pd.read_csv(path)), overlap, window_size)
        if file_windows:
            # the subject is read from the first row of the first window
            person = file_windows[0].reset_index(drop=True).loc[0, "subject_id"]
            file_lengths[person].append(len(file_windows))
        data_list.extend(file_windows)
    return data_list, file_lengths


def feature_extractor(data_list, verbose=True):
    """
    turns every window of `data_list` into a feature vector and a label,
    grouped by subject.

    returns `(X, y)`, two dicts keyed by subject id (1, 2, 3):
    X[subject] is the list of feature vectors from `get_features`,
    y[subject] the list of `activity` values taken from the first row of each
    window. Within a subject the order follows `data_list`.
    """
    if verbose:
        print(f"extracting the features...", end="  ")
    X = {1: [], 2: [], 3: []}
    y = {1: [], 2: [], 3: []}
    indices = trange(0, len(data_list)) if verbose else range(0, len(data_list))
    for idx in indices:
        window = data_list[idx]
        person = window.loc[0, "subject_id"]
        # the id / label columns must not be turned into features
        signals = window.drop(columns=["subject_id", "activity"])
        X[person].append(get_features(signals))
        y[person].append(window.reset_index(drop=True).loc[0, "activity"])
    return X, y


def majority_voting(predictions, file_lengths):
    """
    replaces the per-window predictions of each file by the most frequent
    prediction of that file.

    predictions  : sequence of predicted labels, the windows of one file
                   after the other
    file_lengths : number of windows of each file, in the same file order

    returns a list as long as sum(file_lengths) in which every window carries
    the majority label of its file. When several labels are equally frequent
    the smallest one wins.
    """
    filtered_predictions = []
    index = 0
    for length in file_lengths:
        if length == 0:
            # nothing to vote on
            continue
        file_pred = np.asarray(predictions[index:index+length])
        # counting with np.unique works for any label dtype and does not depend
        # on the return shape of scipy.stats.mode, which differs between SciPy
        # releases; labels come back sorted, so argmax picks the smallest on a tie
        labels, counts = np.unique(file_pred, return_counts=True)
        majority_choice = labels[np.argmax(counts)]
        filtered_predictions.extend([majority_choice]*length)
        index += length
    return filtered_predictions

In [32]:
def LOOCV_train_evaluate(model, overlap_rate, window_size, voting=True, n_repeats=1, verbose=True):
    """
    leave-one-subject-out evaluation of `model`.

    The data is loaded and windowed with `dataloader`, extended with
    `get_streams` and turned into features with `feature_extractor`.
    Each of the three subjects is then held out once while `model` is fitted
    on the other two; this is repeated `n_repeats` times on the same features.

    voting : when True the predictions of the held-out subject are smoothed
             per file with `majority_voting` before scoring

    returns the list of accuracies, 3 * n_repeats values in fold order.
    """
    data_list, file_lengths = dataloader(overlap_rate, window_size, verbose=verbose)
    stream_list = [get_streams(window) for window in data_list]
    X, y = feature_extractor(stream_list, verbose=verbose)
    # (held-out subject, first training subject, second training subject)
    folds = [(1,2,3), (2,3,1), (3,1,2)]
    scores = []
    repeats = trange(n_repeats) if verbose else range(n_repeats)
    for _ in repeats:
        for held_out, train_a, train_b in folds:
            model.fit(X[train_a] + X[train_b], y[train_a] + y[train_b])
            pred = model.predict(X[held_out])
            if voting:
                pred = majority_voting(pred, file_lengths[held_out])
            scores.append(accuracy_score(y[held_out], pred))
    if verbose:
        print(f"\nMean Score: {np.mean(scores)}")
    return scores

In [33]:
def GridSearch(parameters, csvpath = "..", filename="gridCV_results", n_repeats=7, verbose=False, progress=True):
    """
    runs `LOOCV_train_evaluate` for every combination of model, window size
    and overlap rate and writes the results to a csv file.

    parameters : dict with the keys "model", "window_size" and "overlap_rate",
                 each holding the list of values to try
    csvpath    : directory the csv is written to
    filename   : prefix of the csv name; a timestamp is appended
    n_repeats  : forwarded to `LOOCV_train_evaluate`
    verbose    : forwarded to `LOOCV_train_evaluate`
    progress   : show a tqdm progress bar over the combinations

    The csv has one row per combination with the columns model, window_size,
    overlap_rate, n_repeats, avg_score and scores; numeric parameters are
    written as given. Nothing is returned; the path of the csv is printed.
    """
    columns = ["model", "window_size", "overlap_rate", "n_repeats", "avg_score", "scores"]
    models, window_sizes, overlap_rates = parameters["model"], parameters["window_size"], parameters["overlap_rate"]
    combinations = [(i,j,k) for i in models for j in window_sizes for k in overlap_rates]
    if progress:
        combinations = tqdm(combinations)
    # rows are collected in a list and turned into a DataFrame once:
    # DataFrame.append no longer exists in pandas 2.x
    records = []
    for model, window_size, overlap_rate in combinations:
        scores = LOOCV_train_evaluate(model, overlap_rate, window_size, n_repeats=n_repeats, verbose=verbose)
        # plain floats keep the `scores` cell of the csv readable as a json list
        scores = [float(score) for score in scores]
        records.append({"model": model.__str__(), "window_size": window_size,
                        "overlap_rate": overlap_rate, "n_repeats": n_repeats,
                        "avg_score": np.mean(scores), "scores": scores})
    score_df = pd.DataFrame(records, columns=columns)
    # only the file name is sanitised; a ":" inside `csvpath` (e.g. a drive
    # letter) must stay untouched
    basename = f"{filename}_{datetime.now().strftime('%Y-%m-%d %H.%M.%S')}.csv".replace(":", ".")
    savepath = f"{csvpath}/{basename}"
    score_df.to_csv(savepath, index=False)
    print(f"result exported to: {savepath}")


In [35]:
# grid: 4 models x 2 window sizes x 2 overlap rates = 16 combinations
parameters = {
    # random forest and extra trees, 600 and 1200 trees each, all cores
    "model": [estimator(n_trees, n_jobs=-1) for estimator in (RFC, ETC) for n_trees in (600, 1200)],
    "window_size": [3000, 4000],
    "overlap_rate": [0.5, 0.75],
}
GridSearch(parameters, filename="again_with_speed_acc_angles_distances")

100%|██████████| 16/16 [1:16:08<00:00, 285.52s/it]

result exported to: ../again_with_speed_acc_angles_distances_2021-08-16 15.42.02.csv





# LOOCV with frequency, speed, acceleration, distance, plane

In [2]:
def joint_distance(x_data, joint1, joint2):
    """
    returns, for every row of `x_data`, the straight-line distance between
    `joint1` and `joint2`, computed from their `_X`, `_Y` and `_Z` columns.
    """
    squared_distance = sum(
        (x_data[f"{joint2}_{axis}"] - x_data[f"{joint1}_{axis}"])**2 for axis in "XYZ"
    )
    return np.sqrt(squared_distance)

def get_all_joint_distances(x_data):
    """
    adds one `dist_<joint1>_<joint2>` column to `x_data` for every joint pair
    listed below (each computed by `joint_distance`) and returns the same,
    modified `x_data`.
    Only pairs of joints that are not directly connected are worth listing:
    the distance between two consecutive joints (for example left wrist and
    left elbow) is always constant.
    """
    # marker numbers: FH=1 TH=2 RH=3 RS=4 RO=5 RE=6 RW=7 LS=8 LE=9 LW=10 RA=11 LA=12 VS=13
    pairs = [
        ("FH", "LS"),   # front head      ->  left shoulder     (1->8)
        ("FH", "RS"),   # front head      ->  right shoulder    (1->4)
        ("LS", "LW"),   # left shoulder   ->  left wrist        (8->10)
        ("RS", "RW"),   # right shoulder  ->  right wrist       (4->7)
        ("VS", "LE"),   # v sacral        ->  left elbow        (13->9)
        ("VS", "RE"),   # v sacral        ->  right elbow       (13->6)
        ("VS", "LW"),   # v sacral        ->  left wrist        (13->10)
        ("VS", "RW"),   # v sacral        ->  right wrist       (13->7)
        ("VS", "RH"),   # v sacral        ->  rear head         (13->3)
        ("VS", "TH"),   # v sacral        ->  top head          (13->2)
        ("LW", "RW"),   # left wrist      ->  right wrist       (10->7)
        ("LA", "LW"),   # left asis       ->  left wrist        (12->10)
        ("RA", "RW"),   # right asis      ->  right wrist       (11->7)
        ("LW", "TH"),   # left wrist      ->  top head          (10->2)
        ("RW", "TH"),   # right wrist     ->  top head          (7->2)
        ("TH", "LA"),   # top head        ->  left asis         (2->12)
    ]
    for start, end in pairs:
        x_data[f"dist_{start}_{end}"] = joint_distance(x_data, start, end)
    return x_data

# LOOCV with speed, acceleration, distance

In [5]:
def segmentation(df, overlap_rate, time_window):
    """
    cuts `df` into fixed-length, possibly overlapping windows.

    Missing values are filled before cutting: linear interpolation first,
    then forward fill, and 0 for whatever is still missing.

    df           : DataFrame with one row per sample
    overlap_rate : fraction of a window shared with the next window
                   (0.5 -> each window starts half a window after the previous one)
    time_window  : number of rows in each window

    returns a list of DataFrames, each with exactly `time_window` rows and a
    fresh 0..time_window-1 index. Trailing rows that do not fill a whole
    window are dropped; a `df` shorter than one window gives an empty list.
    raises ValueError when the step between windows works out to 0.
    """
    # step of the sliding window; rounding before truncating removes floating
    # point noise, e.g. (1 - 0.9) * 10 evaluates to 0.9999999999999998
    step = int(round((1 - overlap_rate) * time_window, 6))
    if step == 0:
        raise ValueError(
            f"overlap_rate={overlap_rate} with time_window={time_window} gives a step of 0 rows"
        )
    # interpolate
    df = df.interpolate().ffill().fillna(0)
    # positional slicing, so the windows do not depend on the index labels of `df`
    return [
        df.iloc[start:start + time_window].copy().reset_index(drop=True)
        for start in range(0, len(df) - time_window + 1, step)
    ]

def get_speed_acc(x_data):
    """
    returns two DataFrames computed from every column of `x_data` except
    `activity` and `subject_id`:
    the row-to-row difference of each column (columns suffixed `_speed`) and
    the row-to-row difference of that difference (columns suffixed `_acc`).
    Entries without a predecessor are set to 0. The differences are per row;
    they are not divided by a sampling interval.
    """
    x_data = x_data.drop(columns=["activity", "subject_id"])
    speed = x_data.diff().fillna(0)
    # computed before `speed` is relabelled, so the acceleration columns are
    # named `<col>_acc` and not `<col>_speed_acc`
    acc = speed.diff().fillna(0)
    speed.columns = [f"{col}_speed" for col in speed.columns]
    acc.columns = [f"{col}_acc" for col in acc.columns]
    return speed, acc

def get_streams(x_data):
    """
    returns a new DataFrame holding the columns of `x_data` followed by the
    `_speed` and `_acc` columns from `get_speed_acc` and the joint distance
    columns.
    """
    velocity, acceleration = get_speed_acc(x_data)
    streams = pd.concat([x_data, velocity, acceleration], axis=1)
    return get_all_joint_distances(streams)

def get_features(x_data):
    """
    flattens one window into a feature vector.

    For every column of `x_data`, in column order, six values are emitted:
    population standard deviation, mean, maximum, minimum, median and
    population variance. Returns a flat list of 6 * n_columns values.
    The frequency-domain features (skewness / kurtosis of the FFT power
    spectrum) are switched off in this variant.
    """
    features = []
    for name in x_data.columns.tolist():
        column = x_data[name]
        features.extend([
            column.std(ddof=0),
            np.average(column),
            np.max(column),
            np.min(column),
            np.median(column),
            np.var(column),
        ])
        # power_spectrum = np.abs(fft(np.array(column)))**2
        # features.append(stats.skew(power_spectrum))
        # features.append(stats.kurtosis(power_spectrum))
    return features


def dataloader(overlap, window_size, verbose=True):
    """
    reads every csv matching `../TrainData/*/*/*.csv`, renames its columns
    with `rename_columns` and cuts it into windows with `segmentation`.

    returns `(data_list, file_lengths)`:
    data_list    : all windows of all files, in the order the files were read
    file_lengths : {subject_id: [number of windows of each file of that subject]},
                   in the same file order; only subjects 1, 2 and 3 are expected
    A file too short for a single window contributes nothing to either.
    """
    if verbose:
        print("loading the data...", end="\t")
    paths = glob.glob("../TrainData/*/*/*.csv")
    if verbose:
        paths = tqdm(paths)
    data_list = []
    file_lengths = {1: [], 2: [], 3: []}
    for path in paths:
        file_windows = segmentation(rename_columns(pd.read_csv(path)), overlap, window_size)
        if file_windows:
            # the subject is read from the first row of the first window
            person = file_windows[0].reset_index(drop=True).loc[0, "subject_id"]
            file_lengths[person].append(len(file_windows))
        data_list.extend(file_windows)
    return data_list, file_lengths


def feature_extractor(data_list, verbose=True):
    """
    turns every window of `data_list` into a feature vector and a label,
    grouped by subject.

    returns `(X, y)`, two dicts keyed by subject id (1, 2, 3):
    X[subject] is the list of feature vectors from `get_features`,
    y[subject] the list of `activity` values taken from the first row of each
    window. Within a subject the order follows `data_list`.
    """
    if verbose:
        print(f"extracting the features...", end="  ")
    X = {1: [], 2: [], 3: []}
    y = {1: [], 2: [], 3: []}
    indices = trange(0, len(data_list)) if verbose else range(0, len(data_list))
    for idx in indices:
        window = data_list[idx]
        person = window.loc[0, "subject_id"]
        # the id / label columns must not be turned into features
        signals = window.drop(columns=["subject_id", "activity"])
        X[person].append(get_features(signals))
        y[person].append(window.reset_index(drop=True).loc[0, "activity"])
    return X, y


def majority_voting(predictions, file_lengths):
    """
    replaces the per-window predictions of each file by the most frequent
    prediction of that file.

    predictions  : sequence of predicted labels, the windows of one file
                   after the other
    file_lengths : number of windows of each file, in the same file order

    returns a list as long as sum(file_lengths) in which every window carries
    the majority label of its file. When several labels are equally frequent
    the smallest one wins.
    """
    filtered_predictions = []
    index = 0
    for length in file_lengths:
        if length == 0:
            # nothing to vote on
            continue
        file_pred = np.asarray(predictions[index:index+length])
        # counting with np.unique works for any label dtype and does not depend
        # on the return shape of scipy.stats.mode, which differs between SciPy
        # releases; labels come back sorted, so argmax picks the smallest on a tie
        labels, counts = np.unique(file_pred, return_counts=True)
        majority_choice = labels[np.argmax(counts)]
        filtered_predictions.extend([majority_choice]*length)
        index += length
    return filtered_predictions

In [6]:
def LOOCV_train_evaluate(model, overlap_rate, window_size, voting=True, n_repeats=1, verbose=True):
    """
    leave-one-subject-out evaluation of `model`.

    The data is loaded and windowed with `dataloader`, extended with
    `get_streams` and turned into features with `feature_extractor`.
    Each of the three subjects is then held out once while `model` is fitted
    on the other two; this is repeated `n_repeats` times on the same features.

    voting : when True the predictions of the held-out subject are smoothed
             per file with `majority_voting` before scoring

    returns the list of accuracies, 3 * n_repeats values in fold order.
    """
    data_list, file_lengths = dataloader(overlap_rate, window_size, verbose=verbose)
    stream_list = [get_streams(window) for window in data_list]
    X, y = feature_extractor(stream_list, verbose=verbose)
    # (held-out subject, first training subject, second training subject)
    folds = [(1,2,3), (2,3,1), (3,1,2)]
    scores = []
    repeats = trange(n_repeats) if verbose else range(n_repeats)
    for _ in repeats:
        for held_out, train_a, train_b in folds:
            model.fit(X[train_a] + X[train_b], y[train_a] + y[train_b])
            pred = model.predict(X[held_out])
            if voting:
                pred = majority_voting(pred, file_lengths[held_out])
            scores.append(accuracy_score(y[held_out], pred))
    if verbose:
        print(f"\nMean Score: {np.mean(scores)}")
    return scores

In [7]:
def GridSearch(parameters, csvpath = "..", filename="gridCV_results", n_repeats=7, verbose=False, progress=True):
    """
    runs `LOOCV_train_evaluate` for every combination of model, window size
    and overlap rate and writes the results to a csv file.

    parameters : dict with the keys "model", "window_size" and "overlap_rate",
                 each holding the list of values to try
    csvpath    : directory the csv is written to
    filename   : prefix of the csv name; a timestamp is appended
    n_repeats  : forwarded to `LOOCV_train_evaluate`
    verbose    : forwarded to `LOOCV_train_evaluate`
    progress   : show a tqdm progress bar over the combinations

    The csv has one row per combination with the columns model, window_size,
    overlap_rate, n_repeats, avg_score and scores; numeric parameters are
    written as given. Nothing is returned; the path of the csv is printed.
    """
    columns = ["model", "window_size", "overlap_rate", "n_repeats", "avg_score", "scores"]
    models, window_sizes, overlap_rates = parameters["model"], parameters["window_size"], parameters["overlap_rate"]
    combinations = [(i,j,k) for i in models for j in window_sizes for k in overlap_rates]
    if progress:
        combinations = tqdm(combinations)
    # rows are collected in a list and turned into a DataFrame once:
    # DataFrame.append no longer exists in pandas 2.x
    records = []
    for model, window_size, overlap_rate in combinations:
        scores = LOOCV_train_evaluate(model, overlap_rate, window_size, n_repeats=n_repeats, verbose=verbose)
        # plain floats keep the `scores` cell of the csv readable as a json list
        scores = [float(score) for score in scores]
        records.append({"model": model.__str__(), "window_size": window_size,
                        "overlap_rate": overlap_rate, "n_repeats": n_repeats,
                        "avg_score": np.mean(scores), "scores": scores})
    score_df = pd.DataFrame(records, columns=columns)
    # only the file name is sanitised; a ":" inside `csvpath` (e.g. a drive
    # letter) must stay untouched
    basename = f"{filename}_{datetime.now().strftime('%Y-%m-%d %H.%M.%S')}.csv".replace(":", ".")
    savepath = f"{csvpath}/{basename}"
    score_df.to_csv(savepath, index=False)
    print(f"result exported to: {savepath}")


In [None]:
# grid: 6 models x 2 window sizes x 2 overlap rates = 24 combinations
parameters = {
    # random forest and extra trees, 300 / 600 / 1200 trees each, all cores
    "model": [estimator(n_trees, n_jobs=-1) for estimator in (RFC, ETC) for n_trees in (300, 600, 1200)],
    "window_size": [3000, 4000],
    "overlap_rate": [0.5, 0.75],
}
GridSearch(parameters)

# LOOCV with frequency, speed, acceleration and angles 

In [19]:
def joint_angle(x_data, joint1, joint2, joint3):
    """
    returns, for every row of `x_data`, the angle in radians between the
    segment joint1->joint2 and the segment joint2->joint3.

    The angle is measured between the two direction vectors, so three joints
    on a straight line give 0 and a segment that folds back onto the previous
    one gives pi.

    `x_data` must contain the `<joint>_X`, `<joint>_Y` and `<joint>_Z` columns
    of the three joints. The result is a 1-D numpy array with one value per row.
    Rows in which two consecutive joints coincide (zero-length segment) have
    no defined direction and come out as NaN.
    """
    x1, y1, z1 = x_data[f"{joint1}_X"], x_data[f"{joint1}_Y"], x_data[f"{joint1}_Z"]
    x2, y2, z2 = x_data[f"{joint2}_X"], x_data[f"{joint2}_Y"], x_data[f"{joint2}_Z"]
    x3, y3, z3 = x_data[f"{joint3}_X"], x_data[f"{joint3}_Y"], x_data[f"{joint3}_Z"]
    # one row per sample, one column per axis
    v1 = np.array([x2-x1, y2-y1, z2-z1]).T
    v2 = np.array([x3-x2, y3-y2, z3-z2]).T
    v1_unit = v1 / np.linalg.norm(v1, axis=1, keepdims=True)
    v2_unit = v2 / np.linalg.norm(v2, axis=1, keepdims=True)
    cosine = np.sum(v1_unit*v2_unit, axis=1) # row-wise dot product
    # rounding can push the cosine of (anti)parallel segments marginally
    # outside [-1, 1], where arccos returns NaN; clip it back into range
    cosine = np.clip(cosine, -1.0, 1.0)
    return np.arccos(cosine)

def get_all_joint_angles(x_data):
    """
    adds one `angle_<joint1>_<joint2>_<joint3>` column to `x_data` for every
    joint triplet listed below (each computed by `joint_angle`) and returns
    the same, modified `x_data`.
    """
    # marker numbers: FH=1 TH=2 RH=3 RS=4 RO=5 RE=6 RW=7 LS=8 LE=9 LW=10 RA=11 LA=12 VS=13
    triplets = [
        ("LS", "LE", "LW"),     # left shoulder   ->  left elbow      ->  left wrist    (8->9->10)
        ("RS", "RE", "RW"),     # right shoulder  ->  right elbow     ->  right wrist   (4->6->7)
        ("RS", "LS", "FH"),     # right shoulder  ->  left shoulder   ->  front head    (4->8->1)
        ("RS", "LS", "LE"),     # right shoulder  ->  left shoulder   ->  left elbow    (4->8->9)
        ("LS", "RS", "RE"),     # left shoulder   ->  right shoulder  ->  right elbow   (8->4->6)
        ("VS", "RO", "RH"),     # v sacral        ->  right offset    ->  rear head     (13->5->3)
        ("VS", "TH", "FH"),     # v sacral        ->  top head        ->  front head    (13->2->1)
        ("VS", "LS", "LE"),     # v sacral        ->  left shoulder   ->  left elbow    (13->8->9)
        ("VS", "RS", "RE"),     # v sacral        ->  right shoulder  ->  right elbow   (13->4->6)
        ("LA", "LS", "LE"),     # left asis       ->  left shoulder   ->  left elbow    (12->8->9)
        ("RA", "RS", "RE"),     # right asis      ->  right shoulder  ->  right elbow   (11->4->6)
    ]
    for first, middle, last in triplets:
        x_data[f"angle_{first}_{middle}_{last}"] = joint_angle(x_data, first, middle, last)
    return x_data

In [20]:
def segmentation(df, overlap_rate, time_window):
    """
    cuts `df` into fixed-length, possibly overlapping windows.

    Missing values are filled before cutting: linear interpolation first,
    then forward fill, and 0 for whatever is still missing.

    df           : DataFrame with one row per sample
    overlap_rate : fraction of a window shared with the next window
                   (0.5 -> each window starts half a window after the previous one)
    time_window  : number of rows in each window

    returns a list of DataFrames, each with exactly `time_window` rows and a
    fresh 0..time_window-1 index. Trailing rows that do not fill a whole
    window are dropped; a `df` shorter than one window gives an empty list.
    raises ValueError when the step between windows works out to 0.
    """
    # step of the sliding window; rounding before truncating removes floating
    # point noise, e.g. (1 - 0.9) * 10 evaluates to 0.9999999999999998
    step = int(round((1 - overlap_rate) * time_window, 6))
    if step == 0:
        raise ValueError(
            f"overlap_rate={overlap_rate} with time_window={time_window} gives a step of 0 rows"
        )
    # interpolate
    df = df.interpolate().ffill().fillna(0)
    # positional slicing, so the windows do not depend on the index labels of `df`
    return [
        df.iloc[start:start + time_window].copy().reset_index(drop=True)
        for start in range(0, len(df) - time_window + 1, step)
    ]

def get_speed_acc(x_data):
    """
    returns two DataFrames computed from every column of `x_data` except
    `activity` and `subject_id`:
    the row-to-row difference of each column (columns suffixed `_speed`) and
    the row-to-row difference of that difference (columns suffixed `_acc`).
    Entries without a predecessor are set to 0. The differences are per row;
    they are not divided by a sampling interval.
    """
    coords = x_data.drop(columns=["activity", "subject_id"])
    first_diff = coords.diff().fillna(0)
    second_diff = first_diff.diff().fillna(0)
    return first_diff.add_suffix("_speed"), second_diff.add_suffix("_acc")

def get_streams(x_data):
    """
    returns a new DataFrame holding the columns of `x_data` followed by the
    `_speed` and `_acc` columns from `get_speed_acc` and the joint angle
    columns.
    """
    velocity, acceleration = get_speed_acc(x_data)
    streams = pd.concat([x_data, velocity, acceleration], axis=1)
    return get_all_joint_angles(streams)

def get_features(x_data):
    """
    flattens one window into a feature vector.

    For every column of `x_data`, in column order, eight values are emitted:
    population standard deviation, mean, maximum, minimum, median,
    population variance, and the skewness and kurtosis of the column's FFT
    power spectrum. Returns a flat list of 8 * n_columns values.
    """
    features = []
    for name in x_data.columns.tolist():
        column = x_data[name]
        # squared magnitude of the FFT of the raw samples
        power_spectrum = np.abs(fft(np.array(column)))**2
        features.extend([
            column.std(ddof=0),
            np.average(column),
            np.max(column),
            np.min(column),
            np.median(column),
            np.var(column),
            stats.skew(power_spectrum),
            stats.kurtosis(power_spectrum),
        ])
    return features

def rename_columns(df):
    """
    replaces the column labels of `df` in place and returns `df`.

    The new labels are `<joint>_X`, `<joint>_Y`, `<joint>_Z` for the 13 joints
    in the order listed below, followed by `subject_id` and `activity`,
    so `df` must have exactly 41 columns.
    """
    # position in this list + 1 is the marker number used in the comments of
    # the distance / angle helpers (FH=1 ... VS=13)
    joints = ["FH", "TH", "RH", "RS", "RO", "RE", "RW", "LS", "LE", "LW", "RA", "LA", "VS"]
    coordinate_columns = [f"{joint}_{axis}" for joint in joints for axis in "XYZ"]
    df.columns = coordinate_columns + ["subject_id", "activity"]
    return df

def dataloader(overlap, window_size, verbose=True):
    """
    reads every csv matching `../TrainData/*/*/*.csv`, renames its columns
    with `rename_columns` and cuts it into windows with `segmentation`.

    returns `(data_list, file_lengths)`:
    data_list    : all windows of all files, in the order the files were read
    file_lengths : {subject_id: [number of windows of each file of that subject]},
                   in the same file order; only subjects 1, 2 and 3 are expected
    A file too short for a single window contributes nothing to either.
    """
    if verbose:
        print("loading the data...", end="\t")
    paths = glob.glob("../TrainData/*/*/*.csv")
    if verbose:
        paths = tqdm(paths)
    data_list = []
    file_lengths = {1: [], 2: [], 3: []}
    for path in paths:
        file_windows = segmentation(rename_columns(pd.read_csv(path)), overlap, window_size)
        if file_windows:
            # the subject is read from the first row of the first window
            person = file_windows[0].reset_index(drop=True).loc[0, "subject_id"]
            file_lengths[person].append(len(file_windows))
        data_list.extend(file_windows)
    return data_list, file_lengths


def feature_extractor(data_list, verbose=True):
    """
    turns every window of `data_list` into a feature vector and a label,
    grouped by subject.

    returns `(X, y)`, two dicts keyed by subject id (1, 2, 3):
    X[subject] is the list of feature vectors from `get_features`,
    y[subject] the list of `activity` values taken from the first row of each
    window. Within a subject the order follows `data_list`.
    """
    if verbose:
        print(f"extracting the features...", end="  ")
    X = {1: [], 2: [], 3: []}
    y = {1: [], 2: [], 3: []}
    indices = trange(0, len(data_list)) if verbose else range(0, len(data_list))
    for idx in indices:
        window = data_list[idx]
        person = window.loc[0, "subject_id"]
        # the id / label columns must not be turned into features
        signals = window.drop(columns=["subject_id", "activity"])
        X[person].append(get_features(signals))
        y[person].append(window.reset_index(drop=True).loc[0, "activity"])
    return X, y


def majority_voting(predictions, file_lengths):
    """
    replaces the per-window predictions of each file by the most frequent
    prediction of that file.

    predictions  : sequence of predicted labels, the windows of one file
                   after the other
    file_lengths : number of windows of each file, in the same file order

    returns a list as long as sum(file_lengths) in which every window carries
    the majority label of its file. When several labels are equally frequent
    the smallest one wins.
    """
    filtered_predictions = []
    index = 0
    for length in file_lengths:
        if length == 0:
            # nothing to vote on
            continue
        file_pred = np.asarray(predictions[index:index+length])
        # counting with np.unique works for any label dtype and does not depend
        # on the return shape of scipy.stats.mode, which differs between SciPy
        # releases; labels come back sorted, so argmax picks the smallest on a tie
        labels, counts = np.unique(file_pred, return_counts=True)
        majority_choice = labels[np.argmax(counts)]
        filtered_predictions.extend([majority_choice]*length)
        index += length
    return filtered_predictions

In [21]:
def LOOCV_train_evaluate(model, overlap_rate, window_size, voting=True, n_repeats=1, verbose=True):
    """
    leave-one-subject-out evaluation of `model`.

    The data is loaded and windowed with `dataloader`, extended with
    `get_streams` and turned into features with `feature_extractor`.
    Each of the three subjects is then held out once while `model` is fitted
    on the other two; this is repeated `n_repeats` times on the same features.

    voting : when True the predictions of the held-out subject are smoothed
             per file with `majority_voting` before scoring

    returns the list of accuracies, 3 * n_repeats values in fold order.
    """
    data_list, file_lengths = dataloader(overlap_rate, window_size, verbose=verbose)
    stream_list = [get_streams(window) for window in data_list]
    X, y = feature_extractor(stream_list, verbose=verbose)
    # (held-out subject, first training subject, second training subject)
    folds = [(1,2,3), (2,3,1), (3,1,2)]
    scores = []
    repeats = trange(n_repeats) if verbose else range(n_repeats)
    for _ in repeats:
        for held_out, train_a, train_b in folds:
            model.fit(X[train_a] + X[train_b], y[train_a] + y[train_b])
            pred = model.predict(X[held_out])
            if voting:
                pred = majority_voting(pred, file_lengths[held_out])
            scores.append(accuracy_score(y[held_out], pred))
    if verbose:
        print(f"\nMean Score: {np.mean(scores)}")
    return scores

In [22]:
def GridSearch(parameters, csvpath = "..", filename="gridCV_results", n_repeats=7, verbose=False, progress=True):
    """
    runs `LOOCV_train_evaluate` for every combination of model, window size
    and overlap rate and writes the results to a csv file.

    parameters : dict with the keys "model", "window_size" and "overlap_rate",
                 each holding the list of values to try
    csvpath    : directory the csv is written to
    filename   : prefix of the csv name; a timestamp is appended
    n_repeats  : forwarded to `LOOCV_train_evaluate`
    verbose    : forwarded to `LOOCV_train_evaluate`
    progress   : show a tqdm progress bar over the combinations

    The csv has one row per combination with the columns model, window_size,
    overlap_rate, n_repeats, avg_score and scores; numeric parameters are
    written as given. Nothing is returned; the path of the csv is printed.
    """
    columns = ["model", "window_size", "overlap_rate", "n_repeats", "avg_score", "scores"]
    models, window_sizes, overlap_rates = parameters["model"], parameters["window_size"], parameters["overlap_rate"]
    combinations = [(i,j,k) for i in models for j in window_sizes for k in overlap_rates]
    if progress:
        combinations = tqdm(combinations)
    # rows are collected in a list and turned into a DataFrame once:
    # DataFrame.append no longer exists in pandas 2.x
    records = []
    for model, window_size, overlap_rate in combinations:
        scores = LOOCV_train_evaluate(model, overlap_rate, window_size, n_repeats=n_repeats, verbose=verbose)
        # plain floats keep the `scores` cell of the csv readable as a json list
        scores = [float(score) for score in scores]
        records.append({"model": model.__str__(), "window_size": window_size,
                        "overlap_rate": overlap_rate, "n_repeats": n_repeats,
                        "avg_score": np.mean(scores), "scores": scores})
    score_df = pd.DataFrame(records, columns=columns)
    # only the file name is sanitised; a ":" inside `csvpath` (e.g. a drive
    # letter) must stay untouched
    basename = f"{filename}_{datetime.now().strftime('%Y-%m-%d %H.%M.%S')}.csv".replace(":", ".")
    savepath = f"{csvpath}/{basename}"
    score_df.to_csv(savepath, index=False)
    print(f"result exported to: {savepath}")


In [11]:
# grid: 4 models x 2 window sizes x 2 overlap rates = 16 combinations
parameters = {
    # random forest and extra trees, 600 and 1200 trees each, all cores
    "model": [estimator(n_trees, n_jobs=-1) for estimator in (RFC, ETC) for n_trees in (600, 1200)],
    "window_size": [3000, 4000],
    "overlap_rate": [0.5, 0.75],
}
GridSearch(parameters, filename="again_with_freq_speed_acc_angles")

100%|██████████| 20/20 [1:17:55<00:00, 233.78s/it]


result exported to: ../gridCV_results_2021-08-14 19.01.56.csv


# LOOCV with frequency, speed, acceleration, distance, angle

In [29]:
def joint_distance(x_data, joint1, joint2):
    """
    returns, for every row of `x_data`, the straight-line distance between
    `joint1` and `joint2`, computed from their `_X`, `_Y` and `_Z` columns.
    """
    squared_distance = sum(
        (x_data[f"{joint2}_{axis}"] - x_data[f"{joint1}_{axis}"])**2 for axis in "XYZ"
    )
    return np.sqrt(squared_distance)

def get_all_joint_distances(x_data):
    """
    adds one `dist_<joint1>_<joint2>` column to `x_data` for every joint pair
    listed below (each computed by `joint_distance`) and returns the same,
    modified `x_data`.
    Only pairs of joints that are not directly connected are worth listing:
    the distance between two consecutive joints (for example left wrist and
    left elbow) is always constant.
    """
    # marker numbers: FH=1 TH=2 RH=3 RS=4 RO=5 RE=6 RW=7 LS=8 LE=9 LW=10 RA=11 LA=12 VS=13
    pairs = [
        ("FH", "LS"),   # front head      ->  left shoulder     (1->8)
        ("FH", "RS"),   # front head      ->  right shoulder    (1->4)
        ("LS", "LW"),   # left shoulder   ->  left wrist        (8->10)
        ("RS", "RW"),   # right shoulder  ->  right wrist       (4->7)
        ("VS", "LE"),   # v sacral        ->  left elbow        (13->9)
        ("VS", "RE"),   # v sacral        ->  right elbow       (13->6)
        ("VS", "LW"),   # v sacral        ->  left wrist        (13->10)
        ("VS", "RW"),   # v sacral        ->  right wrist       (13->7)
        ("VS", "RH"),   # v sacral        ->  rear head         (13->3)
        ("VS", "TH"),   # v sacral        ->  top head          (13->2)
        ("LW", "RW"),   # left wrist      ->  right wrist       (10->7)
        ("LA", "LW"),   # left asis       ->  left wrist        (12->10)
        ("RA", "RW"),   # right asis      ->  right wrist       (11->7)
        ("LW", "TH"),   # left wrist      ->  top head          (10->2)
        ("RW", "TH"),   # right wrist     ->  top head          (7->2)
        ("TH", "LA"),   # top head        ->  left asis         (2->12)
    ]
    for start, end in pairs:
        x_data[f"dist_{start}_{end}"] = joint_distance(x_data, start, end)
    return x_data

In [30]:
def joint_angle(x_data, joint1, joint2, joint3):
    """
    returns, for every row of `x_data`, the angle in radians between the
    segment joint1->joint2 and the segment joint2->joint3.

    The angle is measured between the two direction vectors, so three joints
    on a straight line give 0 and a segment that folds back onto the previous
    one gives pi.

    `x_data` must contain the `<joint>_X`, `<joint>_Y` and `<joint>_Z` columns
    of the three joints. The result is a 1-D numpy array with one value per row.
    Rows in which two consecutive joints coincide (zero-length segment) have
    no defined direction and come out as NaN.
    """
    x1, y1, z1 = x_data[f"{joint1}_X"], x_data[f"{joint1}_Y"], x_data[f"{joint1}_Z"]
    x2, y2, z2 = x_data[f"{joint2}_X"], x_data[f"{joint2}_Y"], x_data[f"{joint2}_Z"]
    x3, y3, z3 = x_data[f"{joint3}_X"], x_data[f"{joint3}_Y"], x_data[f"{joint3}_Z"]
    # one row per sample, one column per axis
    v1 = np.array([x2-x1, y2-y1, z2-z1]).T
    v2 = np.array([x3-x2, y3-y2, z3-z2]).T
    v1_unit = v1 / np.linalg.norm(v1, axis=1, keepdims=True)
    v2_unit = v2 / np.linalg.norm(v2, axis=1, keepdims=True)
    cosine = np.sum(v1_unit*v2_unit, axis=1) # row-wise dot product
    # rounding can push the cosine of (anti)parallel segments marginally
    # outside [-1, 1], where arccos returns NaN; clip it back into range
    cosine = np.clip(cosine, -1.0, 1.0)
    return np.arccos(cosine)

def get_all_joint_angles(x_data):
    """
    adds one `angle_<joint1>_<joint2>_<joint3>` column to `x_data` for every
    joint triplet listed below (each computed by `joint_angle`) and returns
    the same, modified `x_data`.
    """
    # marker numbers: FH=1 TH=2 RH=3 RS=4 RO=5 RE=6 RW=7 LS=8 LE=9 LW=10 RA=11 LA=12 VS=13
    triplets = [
        ("LS", "LE", "LW"),     # left shoulder   ->  left elbow      ->  left wrist    (8->9->10)
        ("RS", "RE", "RW"),     # right shoulder  ->  right elbow     ->  right wrist   (4->6->7)
        ("RS", "LS", "FH"),     # right shoulder  ->  left shoulder   ->  front head    (4->8->1)
        ("RS", "LS", "LE"),     # right shoulder  ->  left shoulder   ->  left elbow    (4->8->9)
        ("LS", "RS", "RE"),     # left shoulder   ->  right shoulder  ->  right elbow   (8->4->6)
        ("VS", "RO", "RH"),     # v sacral        ->  right offset    ->  rear head     (13->5->3)
        ("VS", "TH", "FH"),     # v sacral        ->  top head        ->  front head    (13->2->1)
        ("VS", "LS", "LE"),     # v sacral        ->  left shoulder   ->  left elbow    (13->8->9)
        ("VS", "RS", "RE"),     # v sacral        ->  right shoulder  ->  right elbow   (13->4->6)
        ("LA", "LS", "LE"),     # left asis       ->  left shoulder   ->  left elbow    (12->8->9)
        ("RA", "RS", "RE"),     # right asis      ->  right shoulder  ->  right elbow   (11->4->6)
    ]
    for first, middle, last in triplets:
        x_data[f"angle_{first}_{middle}_{last}"] = joint_angle(x_data, first, middle, last)
    return x_data

In [31]:
def segmentation(df, overlap_rate, time_window):
    """
    cuts `df` into fixed-length, possibly overlapping windows.

    Missing values are filled before cutting: linear interpolation first,
    then forward fill, and 0 for whatever is still missing.

    df           : DataFrame with one row per sample
    overlap_rate : fraction of a window shared with the next window
                   (0.5 -> each window starts half a window after the previous one)
    time_window  : number of rows in each window

    returns a list of DataFrames, each with exactly `time_window` rows and a
    fresh 0..time_window-1 index. Trailing rows that do not fill a whole
    window are dropped; a `df` shorter than one window gives an empty list.
    raises ValueError when the step between windows works out to 0.
    """
    # step of the sliding window; rounding before truncating removes floating
    # point noise, e.g. (1 - 0.9) * 10 evaluates to 0.9999999999999998
    step = int(round((1 - overlap_rate) * time_window, 6))
    if step == 0:
        raise ValueError(
            f"overlap_rate={overlap_rate} with time_window={time_window} gives a step of 0 rows"
        )
    # interpolate
    df = df.interpolate().ffill().fillna(0)
    # positional slicing, so the windows do not depend on the index labels of `df`
    return [
        df.iloc[start:start + time_window].copy().reset_index(drop=True)
        for start in range(0, len(df) - time_window + 1, step)
    ]

def get_speed_acc(x_data):
    """
    returns two DataFrames computed from every column of `x_data` except
    `activity` and `subject_id`:
    the row-to-row difference of each column (columns suffixed `_speed`) and
    the row-to-row difference of that difference (columns suffixed `_acc`).
    Entries without a predecessor are set to 0. The differences are per row;
    they are not divided by a sampling interval.
    """
    coords = x_data.drop(columns=["activity", "subject_id"])
    first_diff = coords.diff().fillna(0)
    second_diff = first_diff.diff().fillna(0)
    return first_diff.add_suffix("_speed"), second_diff.add_suffix("_acc")

def get_streams(x_data):
    """
    returns a new DataFrame holding the columns of `x_data` followed by the
    `_speed` and `_acc` columns from `get_speed_acc`, the joint distance
    columns and the joint angle columns.
    """
    velocity, acceleration = get_speed_acc(x_data)
    streams = pd.concat([x_data, velocity, acceleration], axis=1)
    # distances first, angles second: this fixes the column (and feature) order
    return get_all_joint_angles(get_all_joint_distances(streams))

def get_features(x_data):
    """
    Flatten one window into a feature vector.

    For every column, in column order, eight values are emitted:
    std (ddof=0), mean, max, min, median, variance, followed by the skewness
    and kurtosis of the column's FFT power spectrum.

    Returns a flat list with 8 entries per column.
    """
    features = []
    for name in x_data.columns.tolist():
        column = x_data[name]
        # time-domain statistics
        features += [
            column.std(ddof=0),
            np.average(column),
            np.max(column),
            np.min(column),
            np.median(column),
            np.var(column),
        ]
        # shape of the power spectrum |FFT|^2
        power = np.abs(fft(np.array(column))) ** 2
        features += [stats.skew(power), stats.kurtosis(power)]
    return features

def rename_columns(df):
    """
    Give the raw CSV columns their marker names, in place.

    Expects 41 columns: X/Y/Z for each of the 13 markers below (in this
    order), followed by "subject_id" and "activity". Returns the same `df`.
    """
    markers = [
        "FH",   #1
        "TH",   #2
        "RH",   #3
        "RS",   #4
        "RO",   #5
        "RE",   #6
        "RW",   #7
        "LS",   #8
        "LE",   #9
        "LW",   #10
        "RA",   #11
        "LA",   #12
        "VS",   #13
    ]
    coordinate_names = [f"{marker}_{axis}" for marker in markers for axis in "XYZ"]
    df.columns = coordinate_names + ["subject_id", "activity"]   # Other columns
    return df

def dataloader(overlap, window_size, verbose=True):
    """
    Read every CSV under ../TrainData and cut it into windows.

    Parameters
    ----------
    overlap : float
        Overlap rate handed to `segmentation`.
    window_size : int
        Window length handed to `segmentation`.
    verbose : bool
        Print a status line and show a progress bar.

    Returns
    -------
    data_list : list of pd.DataFrame
        Windows of all files, in the order the files were read.
    file_lengths : dict
        Keys 1, 2, 3 (subject ids); each value lists the number of windows
        produced by each of that subject's files. Files that yield no window
        are not recorded.
    """
    if verbose:
        print("loading the data...", end="\t")
    paths = glob.glob("../TrainData/*/*/*.csv")
    if verbose:
        paths = tqdm(paths)
    data_list = []
    file_lengths = {1: [], 2: [], 3: []}
    for path in paths:
        recording = rename_columns(pd.read_csv(path))
        windows = segmentation(recording, overlap, window_size)
        if windows:
            # the subject is read from the first row of the first window
            person = windows[0]["subject_id"].iloc[0]
            file_lengths[person].append(len(windows))
        data_list.extend(windows)
    return data_list, file_lengths


def feature_extractor(data_list, verbose=True):
    """
    Turn every window into a feature vector, grouped by subject.

    Returns
    -------
    X : dict
        Keys 1, 2, 3 (subject ids) -> list of feature vectors.
    y : dict
        Keys 1, 2, 3 -> list of activity labels, aligned with `X`.
        The label is read from the first row of each window.
    """
    if verbose:
        print("extracting the features...", end="  ")
    X = {1: [], 2: [], 3: []}
    y = {1: [], 2: [], 3: []}
    indices = trange(0, len(data_list)) if verbose else range(0, len(data_list))
    for idx in indices:
        window = data_list[idx]
        person = window.loc[0, "subject_id"]
        # only the signal columns are turned into features
        signals = window.drop(columns=["subject_id", "activity"])
        X[person].append(get_features(signals))
        y[person].append(window["activity"].iloc[0])
    return X, y


def majority_voting(predictions, file_lengths):
    """
    Replace each window prediction with the majority label of its file.

    Parameters
    ----------
    predictions : sequence
        Per-window predicted labels, ordered file by file.
    file_lengths : iterable of int
        Number of consecutive windows belonging to each file, in the same
        order as `predictions`.

    Returns
    -------
    list
        One label per window covered by `file_lengths`. Ties are resolved in
        favour of the smallest label.

    Raises
    ------
    ValueError
        If `predictions` holds fewer entries than `file_lengths` requires.
    """
    predictions = np.asarray(predictions)
    filtered_predictions = []
    index = 0
    for length in file_lengths:
        if length == 0:
            continue
        file_pred = predictions[index:index + length]
        if len(file_pred) != length:
            raise ValueError("predictions is shorter than the total of file_lengths")
        # np.unique returns sorted labels, so argmax picks the smallest label
        # among ties. Unlike scipy.stats.mode(...).mode[0] this does not depend
        # on the SciPy version and also works for non-numeric labels.
        labels, counts = np.unique(file_pred, return_counts=True)
        majority_choice = labels[np.argmax(counts)]
        filtered_predictions.extend([majority_choice] * length)
        index += length
    return filtered_predictions

In [32]:
def LOOCV_train_evaluate(model, overlap_rate, window_size, voting=True, n_repeats=1, verbose=True):
    """
    Leave-one-subject-out evaluation of `model` on the windowed training data.

    The data is loaded, expanded into streams and converted to features once.
    Then, `n_repeats` times, each of the three subjects is held out in turn:
    the model is fitted on the other two and scored (accuracy) on the
    held-out one. With `voting=True` the window predictions of each file are
    first replaced by that file's majority label.

    Returns
    -------
    list of float
        3 * n_repeats accuracy scores.
    """
    data_list, file_lengths = dataloader(overlap_rate, window_size, verbose=verbose)
    stream_list = [get_streams(window) for window in data_list]
    X, y = feature_extractor(stream_list, verbose=verbose)
    scores = []
    repeats = trange(n_repeats) if verbose else range(n_repeats)
    for _ in repeats:
        # (held-out subject, first training subject, second training subject)
        for test_id, train_a, train_b in [(1,2,3), (2,3,1), (3,1,2)]:
            model.fit(X[train_a] + X[train_b], y[train_a] + y[train_b])
            pred = model.predict(X[test_id])
            if voting:
                pred = majority_voting(pred, file_lengths[test_id])
            scores.append(accuracy_score(y[test_id], pred))
    if verbose:
        print(f"\nMean Score: {np.mean(scores)}")
    return scores

In [33]:
def GridSearch(parameters, csvpath = "..", filename="gridCV_results", n_repeats=7, verbose=False, progress=True):
    """
    Evaluate every (model, window_size, overlap_rate) combination and export
    the results to a timestamped CSV.

    Parameters
    ----------
    parameters : dict
        Must provide the keys "model", "window_size" and "overlap_rate",
        each mapping to a list of candidate values.
    csvpath : str
        Directory the CSV is written to.
    filename : str
        File name prefix; "_<YYYY-MM-DD HH.MM.SS>.csv" is appended.
    n_repeats : int
        Repeats passed to `LOOCV_train_evaluate` for each combination.
    verbose : bool
        Passed to `LOOCV_train_evaluate`.
    progress : bool
        Show a progress bar over the combinations.

    The CSV has one row per combination with the columns
    model, window_size, overlap_rate, n_repeats, avg_score, scores.
    """
    columns = ["model", "window_size", "overlap_rate", "n_repeats", "avg_score", "scores"]
    models, window_sizes, overlap_rates = parameters["model"], parameters["window_size"], parameters["overlap_rate"]
    combinations = [(i,j,k) for i in models for j in window_sizes for k in overlap_rates]
    if progress:
        combinations = tqdm(combinations)
    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    records = []
    for model, window_size, overlap_rate in combinations:
        scores = LOOCV_train_evaluate(model, overlap_rate, window_size, n_repeats=n_repeats, verbose=verbose)
        records.append({"model": str(model), "window_size": window_size,
                        "overlap_rate": overlap_rate, "n_repeats": n_repeats,
                        "avg_score": float(np.mean(scores)),
                        # plain floats keep the exported list JSON-parseable
                        "scores": [float(score) for score in scores]})
    score_df = pd.DataFrame(records, columns=columns)
    # Format the timestamp directly; colons are replaced only in the file
    # name so a csvpath containing ":" (e.g. a drive letter) stays intact.
    stamp = datetime.now().strftime("%Y-%m-%d %H.%M.%S")
    basename = f"{filename}_{stamp}".replace(":", ".")
    savepath = f"{csvpath}/{basename}.csv"
    score_df.to_csv(savepath, index=False)
    print(f"result exported to: {savepath}")


In [35]:
# Grid for this run: random forests and extra-trees with 600 / 1200 trees,
# two window lengths and two overlap rates.
parameters = dict(
    model=[
        RFC(600, n_jobs=-1),
        RFC(1200, n_jobs=-1),
        ETC(600, n_jobs=-1),
        ETC(1200, n_jobs=-1),
    ],
    window_size=[3000, 4000],
    overlap_rate=[0.5, 0.75],
)
GridSearch(parameters, filename="again_with_speed_acc_angles_distances")

100%|██████████| 16/16 [1:16:08<00:00, 285.52s/it]

result exported to: ../again_with_speed_acc_angles_distances_2021-08-16 15.42.02.csv





# LOOCV with frequency, speed, acceleration, distance, plane

In [2]:
def joint_distance(x_data, joint1, joint2):
    """
    Euclidean distance between two joints, computed row by row.

    Coordinates are read from the `<joint>_X`, `<joint>_Y` and `<joint>_Z`
    columns of `x_data`.
    """
    squared_sum = sum(
        (x_data[f"{joint2}_{axis}"] - x_data[f"{joint1}_{axis}"]) ** 2
        for axis in "XYZ"
    )
    return np.sqrt(squared_sum)

def get_all_joint_distances(x_data):
    """
    Add one `dist_<joint1>_<joint2>` column per joint pair listed below.

    `x_data` is modified in place and also returned.
    Directly connected joints (e.g. left wrist and left elbow) are left out
    on purpose: the distance between them is treated as constant.
    """
    joint_pairs = [
        ("FH", "LS"),   # front head      ->  left shoulder     (1->8)
        ("FH", "RS"),   # front head      ->  right shoulder    (1->4)
        ("LS", "LW"),   # left shoulder   ->  left wrist        (8->10)
        ("RS", "RW"),   # right shoulder  ->  right wrist       (4->7)
        ("VS", "LE"),   # v sacral        ->  left elbow        (13->9)
        ("VS", "RE"),   # v sacral        ->  right elbow       (13->6)
        ("VS", "LW"),   # v sacral        ->  left wrist        (13->10)
        ("VS", "RW"),   # v sacral        ->  right wrist       (13->7)
        ("VS", "RH"),   # v sacral        ->  rear head         (13->3)
        ("VS", "TH"),   # v sacral        ->  top head          (13->2)
        ("LW", "RW"),   # left wrist      ->  right wrist       (10->7)
        ("LA", "LW"),   # left asis       ->  left wrist        (12->10)
        ("RA", "RW"),   # right asis      ->  right wrist       (11->7)
        ("LW", "TH"),   # left wrist      ->  top head          (10->2)
        ("RW", "TH"),   # right wrist     ->  top head          (7->2)
        ("TH", "LA"),   # top head        ->  left asis         (2->12)
    ]
    for first, second in joint_pairs:
        x_data[f"dist_{first}_{second}"] = joint_distance(x_data, first, second)
    return x_data

In [5]:
def plane_angles(x_data, joint1, joint2):
    """
    Angles, in radians, between the joint1 -> joint2 vector and each of the
    X, Y and Z coordinate axes, computed row by row.

    Returns
    -------
    (angle_x, angle_y, angle_z) : tuple of np.ndarray
        One value per row. Rows where both joints coincide give NaN, because
        the zero-length vector cannot be normalised.
    """
    start = [x_data[f"{joint1}_{axis}"] for axis in "XYZ"]
    end = [x_data[f"{joint2}_{axis}"] for axis in "XYZ"]
    # one row per sample, one column per coordinate
    v = np.array([e - s for s, e in zip(start, end)]).T
    lengths = np.linalg.norm(v, axis=1)
    v_unit = v / lengths[:, np.newaxis]
    # the dot product with a unit axis vector is the cosine of the angle to it
    angle_x, angle_y, angle_z = (
        np.arccos(np.sum(v_unit * axis_vector, axis=1))
        for axis_vector in np.eye(3, dtype=int)
    )
    return angle_x, angle_y, angle_z

def get_all_angles_with_plane(x_data):
    """
    Add `angle_x_*`, `angle_y_*` and `angle_z_*` columns for each body
    segment listed below, using `plane_angles`.

    `x_data` is modified in place and also returned.
    """
    segments = [
        ("LS", "LE"),   # arm angles: left shoulder     ->  left elbow
        ("LE", "LW"),   # forearm angles: left elbow    ->  left wrist
        ("RS", "RE"),   # arm angles: right shoulder    ->  right elbow
        ("RE", "RW"),   # forearm angles: right elbow   ->  right wrist
        ("VS", "RH"),   # backbone angles: v sacral     ->  rear head
    ]
    for first, second in segments:
        angles = plane_angles(x_data, first, second)
        for axis, angle in zip("xyz", angles):
            x_data[f"angle_{axis}_{first}_{second}"] = angle
    return x_data

In [6]:
def segmentation(df, overlap_rate, time_window):
    """
    Split one recording into fixed-length sliding windows.

    Missing values are filled before windowing: linear interpolation, then
    forward fill, then 0 for anything still missing (e.g. leading gaps).

    Parameters
    ----------
    df : pd.DataFrame
        One recording; rows are consecutive samples.
    overlap_rate : float
        Fraction of a window shared with the next window (0 = no overlap).
        Must be smaller than 1.
    time_window : int
        Number of rows per window. Must be at least 1.

    Returns
    -------
    list of pd.DataFrame
        Windows of exactly `time_window` rows, each re-indexed from 0.
        Empty when the recording is shorter than one window.

    Raises
    ------
    ValueError
        If `time_window` < 1 or `overlap_rate` >= 1.
    """
    if time_window < 1:
        raise ValueError(f"time_window must be >= 1, got {time_window}")
    if overlap_rate >= 1:
        raise ValueError(f"overlap_rate must be < 1, got {overlap_rate}")
    # Convert the overlap rate into the sliding-window step. Clamp to 1 so a
    # rate close to 1 cannot truncate to a zero step (range() rejects step 0).
    step = max(1, int((1 - overlap_rate) * time_window))
    # interpolate
    df = df.interpolate().ffill().fillna(0)
    # Slice by position: label-based .loc is only correct on a 0..n-1 index.
    return [
        df.iloc[start:start + time_window].copy().reset_index(drop=True)
        for start in range(0, len(df) - time_window + 1, step)
    ]

def get_speed_acc(x_data):
    """
    Derive speed and acceleration streams from the coordinate columns.

    Speed is the row-to-row difference of every column except "activity" and
    "subject_id"; acceleration is the row-to-row difference of speed.
    Undefined entries (the first row of each difference) are set to 0.

    Returns
    -------
    (speed, acc) : tuple of pd.DataFrame
        Same shape as the coordinate columns, with `_speed` / `_acc`
        appended to each column name. `x_data` itself is not modified.
    """
    coords = x_data.drop(columns=["activity", "subject_id"])
    first_diff = coords.diff().fillna(0)
    second_diff = first_diff.diff().fillna(0)
    return first_diff.add_suffix("_speed"), second_diff.add_suffix("_acc")

def get_streams(x_data):
    """
    Build the full set of signal streams for one window.

    The window is widened with its speed and acceleration columns, then the
    joint-distance columns and the per-axis segment angle columns are added.
    Returns the widened DataFrame.
    """
    derived = get_speed_acc(x_data)
    streams = pd.concat([x_data, *derived], axis=1)
    streams = get_all_joint_distances(streams)
    return get_all_angles_with_plane(streams)

def get_features(x_data):
    """
    Flatten one window into a feature vector.

    For every column, in column order, eight values are emitted:
    std (ddof=0), mean, max, min, median, variance, followed by the skewness
    and kurtosis of the column's FFT power spectrum.

    Returns a flat list with 8 entries per column.
    """
    features = []
    for name in x_data.columns.tolist():
        column = x_data[name]
        # time-domain statistics
        features += [
            column.std(ddof=0),
            np.average(column),
            np.max(column),
            np.min(column),
            np.median(column),
            np.var(column),
        ]
        # shape of the power spectrum |FFT|^2
        power = np.abs(fft(np.array(column))) ** 2
        features += [stats.skew(power), stats.kurtosis(power)]
    return features

def rename_columns(df):
    """
    Give the raw CSV columns their marker names, in place.

    Expects 41 columns: X/Y/Z for each of the 13 markers below (in this
    order), followed by "subject_id" and "activity". Returns the same `df`.
    """
    markers = [
        "FH",   #1
        "TH",   #2
        "RH",   #3
        "RS",   #4
        "RO",   #5
        "RE",   #6
        "RW",   #7
        "LS",   #8
        "LE",   #9
        "LW",   #10
        "RA",   #11
        "LA",   #12
        "VS",   #13
    ]
    coordinate_names = [f"{marker}_{axis}" for marker in markers for axis in "XYZ"]
    df.columns = coordinate_names + ["subject_id", "activity"]   # Other columns
    return df

def dataloader(overlap, window_size, verbose=True):
    """
    Read every CSV under ../TrainData and cut it into windows.

    Parameters
    ----------
    overlap : float
        Overlap rate handed to `segmentation`.
    window_size : int
        Window length handed to `segmentation`.
    verbose : bool
        Print a status line and show a progress bar.

    Returns
    -------
    data_list : list of pd.DataFrame
        Windows of all files, in the order the files were read.
    file_lengths : dict
        Keys 1, 2, 3 (subject ids); each value lists the number of windows
        produced by each of that subject's files. Files that yield no window
        are not recorded.
    """
    if verbose:
        print("loading the data...", end="\t")
    paths = glob.glob("../TrainData/*/*/*.csv")
    if verbose:
        paths = tqdm(paths)
    data_list = []
    file_lengths = {1: [], 2: [], 3: []}
    for path in paths:
        recording = rename_columns(pd.read_csv(path))
        windows = segmentation(recording, overlap, window_size)
        if windows:
            # the subject is read from the first row of the first window
            person = windows[0]["subject_id"].iloc[0]
            file_lengths[person].append(len(windows))
        data_list.extend(windows)
    return data_list, file_lengths


def feature_extractor(data_list, verbose=True):
    """
    Turn every window into a feature vector, grouped by subject.

    Returns
    -------
    X : dict
        Keys 1, 2, 3 (subject ids) -> list of feature vectors.
    y : dict
        Keys 1, 2, 3 -> list of activity labels, aligned with `X`.
        The label is read from the first row of each window.
    """
    if verbose:
        print("extracting the features...", end="  ")
    X = {1: [], 2: [], 3: []}
    y = {1: [], 2: [], 3: []}
    indices = trange(0, len(data_list)) if verbose else range(0, len(data_list))
    for idx in indices:
        window = data_list[idx]
        person = window.loc[0, "subject_id"]
        # only the signal columns are turned into features
        signals = window.drop(columns=["subject_id", "activity"])
        X[person].append(get_features(signals))
        y[person].append(window["activity"].iloc[0])
    return X, y


def majority_voting(predictions, file_lengths):
    """
    Replace each window prediction with the majority label of its file.

    Parameters
    ----------
    predictions : sequence
        Per-window predicted labels, ordered file by file.
    file_lengths : iterable of int
        Number of consecutive windows belonging to each file, in the same
        order as `predictions`.

    Returns
    -------
    list
        One label per window covered by `file_lengths`. Ties are resolved in
        favour of the smallest label.

    Raises
    ------
    ValueError
        If `predictions` holds fewer entries than `file_lengths` requires.
    """
    predictions = np.asarray(predictions)
    filtered_predictions = []
    index = 0
    for length in file_lengths:
        if length == 0:
            continue
        file_pred = predictions[index:index + length]
        if len(file_pred) != length:
            raise ValueError("predictions is shorter than the total of file_lengths")
        # np.unique returns sorted labels, so argmax picks the smallest label
        # among ties. Unlike scipy.stats.mode(...).mode[0] this does not depend
        # on the SciPy version and also works for non-numeric labels.
        labels, counts = np.unique(file_pred, return_counts=True)
        majority_choice = labels[np.argmax(counts)]
        filtered_predictions.extend([majority_choice] * length)
        index += length
    return filtered_predictions

In [7]:
def LOOCV_train_evaluate(model, overlap_rate, window_size, voting=True, n_repeats=1, verbose=True):
    """
    Leave-one-subject-out evaluation of `model` on the windowed training data.

    The data is loaded, expanded into streams and converted to features once.
    Then, `n_repeats` times, each of the three subjects is held out in turn:
    the model is fitted on the other two and scored (accuracy) on the
    held-out one. With `voting=True` the window predictions of each file are
    first replaced by that file's majority label.

    Returns
    -------
    list of float
        3 * n_repeats accuracy scores.
    """
    data_list, file_lengths = dataloader(overlap_rate, window_size, verbose=verbose)
    stream_list = [get_streams(window) for window in data_list]
    X, y = feature_extractor(stream_list, verbose=verbose)
    scores = []
    repeats = trange(n_repeats) if verbose else range(n_repeats)
    for _ in repeats:
        # (held-out subject, first training subject, second training subject)
        for test_id, train_a, train_b in [(1,2,3), (2,3,1), (3,1,2)]:
            model.fit(X[train_a] + X[train_b], y[train_a] + y[train_b])
            pred = model.predict(X[test_id])
            if voting:
                pred = majority_voting(pred, file_lengths[test_id])
            scores.append(accuracy_score(y[test_id], pred))
    if verbose:
        print(f"\nMean Score: {np.mean(scores)}")
    return scores

In [8]:
def GridSearch(parameters, csvpath = "..", filename="gridCV_results", n_repeats=7, verbose=False, progress=True):
    """
    Evaluate every (model, window_size, overlap_rate) combination and export
    the results to a timestamped CSV.

    Parameters
    ----------
    parameters : dict
        Must provide the keys "model", "window_size" and "overlap_rate",
        each mapping to a list of candidate values.
    csvpath : str
        Directory the CSV is written to.
    filename : str
        File name prefix; "_<YYYY-MM-DD HH.MM.SS>.csv" is appended.
    n_repeats : int
        Repeats passed to `LOOCV_train_evaluate` for each combination.
    verbose : bool
        Passed to `LOOCV_train_evaluate`.
    progress : bool
        Show a progress bar over the combinations.

    The CSV has one row per combination with the columns
    model, window_size, overlap_rate, n_repeats, avg_score, scores.
    """
    columns = ["model", "window_size", "overlap_rate", "n_repeats", "avg_score", "scores"]
    models, window_sizes, overlap_rates = parameters["model"], parameters["window_size"], parameters["overlap_rate"]
    combinations = [(i,j,k) for i in models for j in window_sizes for k in overlap_rates]
    if progress:
        combinations = tqdm(combinations)
    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    records = []
    for model, window_size, overlap_rate in combinations:
        scores = LOOCV_train_evaluate(model, overlap_rate, window_size, n_repeats=n_repeats, verbose=verbose)
        records.append({"model": str(model), "window_size": window_size,
                        "overlap_rate": overlap_rate, "n_repeats": n_repeats,
                        "avg_score": float(np.mean(scores)),
                        # plain floats keep the exported list JSON-parseable
                        "scores": [float(score) for score in scores]})
    score_df = pd.DataFrame(records, columns=columns)
    # Format the timestamp directly; colons are replaced only in the file
    # name so a csvpath containing ":" (e.g. a drive letter) stays intact.
    stamp = datetime.now().strftime("%Y-%m-%d %H.%M.%S")
    basename = f"{filename}_{stamp}".replace(":", ".")
    savepath = f"{csvpath}/{basename}.csv"
    score_df.to_csv(savepath, index=False)
    print(f"result exported to: {savepath}")


In [18]:
# Grid for this run: random forests with 600 / 1000 / 1500 trees, extra-trees
# with 1000 / 1500 trees, three window lengths and two overlap rates.
parameters = dict(
    model=[
        RFC(600, n_jobs=-1),
        RFC(1000, n_jobs=-1),
        RFC(1500, n_jobs=-1),
        ETC(1000, n_jobs=-1),
        ETC(1500, n_jobs=-1),
    ],
    window_size=[3000, 4000, 5000],
    overlap_rate=[0.5, 0.75],
)
GridSearch(parameters, filename="with_speed_acc_distances_plane")

100%|██████████| 30/30 [3:00:51<00:00, 361.72s/it]


result exported to: ../with_speed_acc_distances_plane_2021-08-15 20.50.30.csv


# LOOCV with ex_features, ex_frequency, speed, acceleration, distance

In [2]:
def joint_distance(x_data, joint1, joint2):
    """
    Euclidean distance between two joints, computed row by row.

    Coordinates are read from the `<joint>_X`, `<joint>_Y` and `<joint>_Z`
    columns of `x_data`.
    """
    squared_sum = sum(
        (x_data[f"{joint2}_{axis}"] - x_data[f"{joint1}_{axis}"]) ** 2
        for axis in "XYZ"
    )
    return np.sqrt(squared_sum)

def get_all_joint_distances(x_data):
    """
    Add one `dist_<joint1>_<joint2>` column per joint pair listed below.

    `x_data` is modified in place and also returned.
    Directly connected joints (e.g. left wrist and left elbow) are left out
    on purpose: the distance between them is treated as constant.
    """
    joint_pairs = [
        ("FH", "LS"),   # front head      ->  left shoulder     (1->8)
        ("FH", "RS"),   # front head      ->  right shoulder    (1->4)
        ("LS", "LW"),   # left shoulder   ->  left wrist        (8->10)
        ("RS", "RW"),   # right shoulder  ->  right wrist       (4->7)
        ("VS", "LE"),   # v sacral        ->  left elbow        (13->9)
        ("VS", "RE"),   # v sacral        ->  right elbow       (13->6)
        ("VS", "LW"),   # v sacral        ->  left wrist        (13->10)
        ("VS", "RW"),   # v sacral        ->  right wrist       (13->7)
        ("VS", "RH"),   # v sacral        ->  rear head         (13->3)
        ("VS", "TH"),   # v sacral        ->  top head          (13->2)
        ("LW", "RW"),   # left wrist      ->  right wrist       (10->7)
        ("LA", "LW"),   # left asis       ->  left wrist        (12->10)
        ("RA", "RW"),   # right asis      ->  right wrist       (11->7)
        ("LW", "TH"),   # left wrist      ->  top head          (10->2)
        ("RW", "TH"),   # right wrist     ->  top head          (7->2)
        ("TH", "LA"),   # top head        ->  left asis         (2->12)
    ]
    for first, second in joint_pairs:
        x_data[f"dist_{first}_{second}"] = joint_distance(x_data, first, second)
    return x_data

In [14]:
def num_zero_crossings(series):
    """
    Count how often the signal changes sign.

    Samples that are exactly 0 (or NaN) are skipped, so a transition such as
    1, 0, -1 counts as one crossing; the pairwise product test
    `a[i] * a[i+1] < 0` would miss it because both products are 0.

    Parameters
    ----------
    series : array-like of numbers (e.g. pd.Series)

    Returns
    -------
    int
        Number of sign changes between consecutive non-zero samples.
    """
    signs = np.sign(np.asarray(series, dtype=float))
    signs = signs[(signs != 0) & ~np.isnan(signs)]
    return int(np.count_nonzero(signs[:-1] != signs[1:]))

def get_peaks(series):
    """
    Summarise the local maxima of a signal.

    Parameters
    ----------
    series : array-like of numbers (e.g. pd.Series)

    Returns
    -------
    (num_peaks, mean_diff, mean_height, std_height) : tuple
        num_peaks   - number of peaks found by `scipy.signal.find_peaks`
        mean_diff   - mean spacing (in samples) between consecutive peaks,
                      0 when there are fewer than two peaks
        mean_height - mean signal value at the peaks
        std_height  - sample standard deviation (ddof=1) of the peak values,
                      0 when there are fewer than two peaks
        All four values are 0 when no peak is found.
    """
    # Work on positions, not index labels, so any Series index is accepted.
    values = np.asarray(series)
    peaks, _ = signal.find_peaks(values)
    num_peaks = len(peaks)
    if not num_peaks:
        return 0, 0, 0, 0
    heights = pd.Series(values[peaks])
    mean_diff = np.diff(peaks).mean() if num_peaks>1 else 0
    mean_height = heights.mean()
    # The sample std of a single value is NaN; fall back to 0 like mean_diff
    # so no NaN ends up in the feature vector.
    std_height = heights.std() if num_peaks>1 else 0
    return num_peaks, mean_diff, mean_height, std_height

In [15]:
def segmentation(df, overlap_rate, time_window):
    """
    Split one recording into fixed-length sliding windows.

    Missing values are filled before windowing: linear interpolation, then
    forward fill, then 0 for anything still missing (e.g. leading gaps).

    Parameters
    ----------
    df : pd.DataFrame
        One recording; rows are consecutive samples.
    overlap_rate : float
        Fraction of a window shared with the next window (0 = no overlap).
        Must be smaller than 1.
    time_window : int
        Number of rows per window. Must be at least 1.

    Returns
    -------
    list of pd.DataFrame
        Windows of exactly `time_window` rows, each re-indexed from 0.
        Empty when the recording is shorter than one window.

    Raises
    ------
    ValueError
        If `time_window` < 1 or `overlap_rate` >= 1.
    """
    if time_window < 1:
        raise ValueError(f"time_window must be >= 1, got {time_window}")
    if overlap_rate >= 1:
        raise ValueError(f"overlap_rate must be < 1, got {overlap_rate}")
    # Convert the overlap rate into the sliding-window step. Clamp to 1 so a
    # rate close to 1 cannot truncate to a zero step (range() rejects step 0).
    step = max(1, int((1 - overlap_rate) * time_window))
    # interpolate
    df = df.interpolate().ffill().fillna(0)
    # Slice by position: label-based .loc is only correct on a 0..n-1 index.
    return [
        df.iloc[start:start + time_window].copy().reset_index(drop=True)
        for start in range(0, len(df) - time_window + 1, step)
    ]

def get_speed_acc(x_data):
    """
    Derive speed and acceleration streams from the coordinate columns.

    Speed is the row-to-row difference of every column except "activity" and
    "subject_id"; acceleration is the row-to-row difference of speed.
    Undefined entries (the first row of each difference) are set to 0.

    Returns
    -------
    (speed, acc) : tuple of pd.DataFrame
        Same shape as the coordinate columns, with `_speed` / `_acc`
        appended to each column name. `x_data` itself is not modified.
    """
    coords = x_data.drop(columns=["activity", "subject_id"])
    first_diff = coords.diff().fillna(0)
    second_diff = first_diff.diff().fillna(0)
    return first_diff.add_suffix("_speed"), second_diff.add_suffix("_acc")

def get_streams(x_data):
    """
    Build the full set of signal streams for one window.

    The window is widened with its speed and acceleration columns, then the
    joint-distance columns are added. Returns the widened DataFrame.
    """
    derived = get_speed_acc(x_data)
    streams = pd.concat([x_data, *derived], axis=1)
    return get_all_joint_distances(streams)

def get_features(x_data):
    """
    Flatten one window into a feature vector.

    For every column, in column order, 21 values are emitted:
      - time domain: std (ddof=0), mean, max, min, median, variance,
        number of zero crossings, the four values of `get_peaks`,
        energy (sum of squares) and inter-quartile range;
      - frequency domain (FFT power spectrum): skewness, kurtosis,
        std (ddof=0), mean, max, min, median and inter-quartile range.

    Returns a flat list with 21 entries per column.
    """
    features = []
    for name in x_data.columns.tolist():
        column = x_data[name]
        # time domain features
        features += [
            column.std(ddof=0),
            np.average(column),
            np.max(column),
            np.min(column),
            np.median(column),
            np.var(column),
        ]
        # extra time features
        features.append(num_zero_crossings(column))     # num of zero crossing
        features += list(get_peaks(column))             # peak related features
        features.append((column**2).sum())              # energy
        features.append(stats.iqr(column))              # inter quartile range

        # freq domain features, computed on the power spectrum |FFT|^2
        power = np.abs(fft(np.array(column)))**2
        features += [
            stats.skew(power),
            stats.kurtosis(power),
            # extra freq domain features
            power.std(ddof=0),
            np.average(power),
            np.max(power),
            np.min(power),
            np.median(power),
            stats.iqr(power),                           # inter quartile range
        ]
    return features

def rename_columns(df):
    """
    Give the raw CSV columns their marker names, in place.

    Expects 41 columns: X/Y/Z for each of the 13 markers below (in this
    order), followed by "subject_id" and "activity". Returns the same `df`.
    """
    markers = [
        "FH",   #1
        "TH",   #2
        "RH",   #3
        "RS",   #4
        "RO",   #5
        "RE",   #6
        "RW",   #7
        "LS",   #8
        "LE",   #9
        "LW",   #10
        "RA",   #11
        "LA",   #12
        "VS",   #13
    ]
    coordinate_names = [f"{marker}_{axis}" for marker in markers for axis in "XYZ"]
    df.columns = coordinate_names + ["subject_id", "activity"]   # Other columns
    return df

def dataloader(overlap, window_size, verbose=True):
    """
    Read every CSV under ../TrainData and cut it into windows.

    Parameters
    ----------
    overlap : float
        Overlap rate handed to `segmentation`.
    window_size : int
        Window length handed to `segmentation`.
    verbose : bool
        Print a status line and show a progress bar.

    Returns
    -------
    data_list : list of pd.DataFrame
        Windows of all files, in the order the files were read.
    file_lengths : dict
        Keys 1, 2, 3 (subject ids); each value lists the number of windows
        produced by each of that subject's files. Files that yield no window
        are not recorded.
    """
    if verbose:
        print("loading the data...", end="\t")
    paths = glob.glob("../TrainData/*/*/*.csv")
    if verbose:
        paths = tqdm(paths)
    data_list = []
    file_lengths = {1: [], 2: [], 3: []}
    for path in paths:
        recording = rename_columns(pd.read_csv(path))
        windows = segmentation(recording, overlap, window_size)
        if windows:
            # the subject is read from the first row of the first window
            person = windows[0]["subject_id"].iloc[0]
            file_lengths[person].append(len(windows))
        data_list.extend(windows)
    return data_list, file_lengths


def feature_extractor(data_list, verbose=True):
    """
    Turn every window into a feature vector, grouped by subject.

    Returns
    -------
    X : dict
        Keys 1, 2, 3 (subject ids) -> list of feature vectors.
    y : dict
        Keys 1, 2, 3 -> list of activity labels, aligned with `X`.
        The label is read from the first row of each window.
    """
    if verbose:
        print("extracting the features...", end="  ")
    X = {1: [], 2: [], 3: []}
    y = {1: [], 2: [], 3: []}
    indices = trange(0, len(data_list)) if verbose else range(0, len(data_list))
    for idx in indices:
        window = data_list[idx]
        person = window.loc[0, "subject_id"]
        # only the signal columns are turned into features
        signals = window.drop(columns=["subject_id", "activity"])
        X[person].append(get_features(signals))
        y[person].append(window["activity"].iloc[0])
    return X, y


def majority_voting(predictions, file_lengths):
    """
    Replace each window prediction with the majority label of its file.

    Parameters
    ----------
    predictions : sequence
        Per-window predicted labels, ordered file by file.
    file_lengths : iterable of int
        Number of consecutive windows belonging to each file, in the same
        order as `predictions`.

    Returns
    -------
    list
        One label per window covered by `file_lengths`. Ties are resolved in
        favour of the smallest label.

    Raises
    ------
    ValueError
        If `predictions` holds fewer entries than `file_lengths` requires.
    """
    predictions = np.asarray(predictions)
    filtered_predictions = []
    index = 0
    for length in file_lengths:
        if length == 0:
            continue
        file_pred = predictions[index:index + length]
        if len(file_pred) != length:
            raise ValueError("predictions is shorter than the total of file_lengths")
        # np.unique returns sorted labels, so argmax picks the smallest label
        # among ties. Unlike scipy.stats.mode(...).mode[0] this does not depend
        # on the SciPy version and also works for non-numeric labels.
        labels, counts = np.unique(file_pred, return_counts=True)
        majority_choice = labels[np.argmax(counts)]
        filtered_predictions.extend([majority_choice] * length)
        index += length
    return filtered_predictions

In [16]:
def LOOCV_train_evaluate(model, overlap_rate, window_size, voting=True, n_repeats=1, verbose=True):
    """
    Leave-one-subject-out evaluation of `model` on the windowed training data.

    The data is loaded, expanded into streams and converted to features once.
    Then, `n_repeats` times, each of the three subjects is held out in turn:
    the model is fitted on the other two and scored (accuracy) on the
    held-out one. With `voting=True` the window predictions of each file are
    first replaced by that file's majority label.

    Returns
    -------
    list of float
        3 * n_repeats accuracy scores.
    """
    data_list, file_lengths = dataloader(overlap_rate, window_size, verbose=verbose)
    stream_list = [get_streams(window) for window in data_list]
    X, y = feature_extractor(stream_list, verbose=verbose)
    scores = []
    repeats = trange(n_repeats) if verbose else range(n_repeats)
    for _ in repeats:
        # (held-out subject, first training subject, second training subject)
        for test_id, train_a, train_b in [(1,2,3), (2,3,1), (3,1,2)]:
            model.fit(X[train_a] + X[train_b], y[train_a] + y[train_b])
            pred = model.predict(X[test_id])
            if voting:
                pred = majority_voting(pred, file_lengths[test_id])
            scores.append(accuracy_score(y[test_id], pred))
    if verbose:
        print(f"\nMean Score: {np.mean(scores)}")
    return scores

In [17]:
def GridSearch(parameters, csvpath = "..", filename="gridCV_results", n_repeats=7, verbose=False, progress=True):
    """
    Evaluate every (model, window_size, overlap_rate) combination and export
    the results to a timestamped CSV.

    Parameters
    ----------
    parameters : dict
        Must provide the keys "model", "window_size" and "overlap_rate",
        each mapping to a list of candidate values.
    csvpath : str
        Directory the CSV is written to.
    filename : str
        File name prefix; "_<YYYY-MM-DD HH.MM.SS>.csv" is appended.
    n_repeats : int
        Repeats passed to `LOOCV_train_evaluate` for each combination.
    verbose : bool
        Passed to `LOOCV_train_evaluate`.
    progress : bool
        Show a progress bar over the combinations.

    The CSV has one row per combination with the columns
    model, window_size, overlap_rate, n_repeats, avg_score, scores.
    """
    columns = ["model", "window_size", "overlap_rate", "n_repeats", "avg_score", "scores"]
    models, window_sizes, overlap_rates = parameters["model"], parameters["window_size"], parameters["overlap_rate"]
    combinations = [(i,j,k) for i in models for j in window_sizes for k in overlap_rates]
    if progress:
        combinations = tqdm(combinations)
    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    records = []
    for model, window_size, overlap_rate in combinations:
        scores = LOOCV_train_evaluate(model, overlap_rate, window_size, n_repeats=n_repeats, verbose=verbose)
        records.append({"model": str(model), "window_size": window_size,
                        "overlap_rate": overlap_rate, "n_repeats": n_repeats,
                        "avg_score": float(np.mean(scores)),
                        # plain floats keep the exported list JSON-parseable
                        "scores": [float(score) for score in scores]})
    score_df = pd.DataFrame(records, columns=columns)
    # Format the timestamp directly; colons are replaced only in the file
    # name so a csvpath containing ":" (e.g. a drive letter) stays intact.
    stamp = datetime.now().strftime("%Y-%m-%d %H.%M.%S")
    basename = f"{filename}_{stamp}".replace(":", ".")
    savepath = f"{csvpath}/{basename}.csv"
    score_df.to_csv(savepath, index=False)
    print(f"result exported to: {savepath}")


In [18]:
# Grid for this run: random forests with 600 / 1200 trees, extra-trees with
# 1000 / 1500 trees, two window lengths and two overlap rates.
parameters = dict(
    model=[
        RFC(600, n_jobs=-1),
        RFC(1200, n_jobs=-1),
        ETC(1000, n_jobs=-1),
        ETC(1500, n_jobs=-1),
    ],
    window_size=[3000, 4000],
    overlap_rate=[0.5, 0.75],
)
GridSearch(parameters, filename="with_exfeatures_exfreq_speed_acc_distance")

100%|██████████| 16/16 [1:49:00<00:00, 408.77s/it]

result exported to: ../with_exfeatures_exfreq_speed_acc_distance_2021-08-16 03.11.01.csv



