In [46]:
import glob
from datetime import datetime
import os

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from scipy import stats, signal
from scipy.stats import mode
from scipy.fft import fft
from sklearn.ensemble import RandomForestClassifier as RFC, ExtraTreesClassifier as ETC
from sklearn.model_selection import cross_val_score, StratifiedKFold, cross_val_predict
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.svm import SVC

import lightgbm as lgb
import xgboost as xgb

from tqdm import tqdm, trange

# Generic functions (usable for all test cases)

## Core functions (Change path here)

In [2]:
def segmentation(df, overlap_rate, time_window):
    seg_data = []
    #convert overlap rate to step for sliding window
    overlap = int((1 - overlap_rate)*time_window)
    # interpolate
    df = df.interpolate().ffill().fillna(0)
    for i in range(0, len(df)-time_window+1, overlap):
        seg_data.append(df.loc[i:i+time_window-1, :].copy().reset_index(drop=True))
    return seg_data


def rename_columns(df):
    df.columns = [
        "FH_X", "FH_Y", "FH_Z",     #1
        "TH_X", "TH_Y", "TH_Z",     #2
        "RH_X", "RH_Y", "RH_Z",     #3
        "RS_X", "RS_Y", "RS_Z",     #4
        "RO_X", "RO_Y", "RO_Z",     #5
        "RE_X", "RE_Y", "RE_Z",     #6
        "RW_X", "RW_Y", "RW_Z",     #7
        "LS_X", "LS_Y", "LS_Z",     #8
        "LE_X", "LE_Y", "LE_Z",     #9
        "LW_X", "LW_Y", "LW_Z",     #10
        "RA_X", "RA_Y", "RA_Z",     #11
        "LA_X", "LA_Y", "LA_Z",     #12
        "VS_X", "VS_Y", "VS_Z",     #13
        "subject_id", "activity",   # Other columns
    ]
    return df


def dataloader(overlap, window_size, verbose=True):
    if verbose:
        print("loading the data...", end="\t")
    data_list = []
    file_lengths = {1: [], 2: [], 3: []}
    ##### CHANGE PATH ######
    files = tqdm(glob.glob("../TrainData/*/*/*.csv")) if verbose else glob.glob("../TrainData/*/*/*.csv")
    ##### ##### ##### ######
    for file in files:
        tempdf = pd.read_csv(file)
        tempdf = rename_columns(tempdf)
        segmented_data = segmentation(tempdf, overlap, window_size)
        if len(segmented_data)>0:
            person = segmented_data[0].reset_index(drop=True).loc[0, "subject_id"]
            file_lengths[person].append(len(segmented_data))   
        data_list.extend(segmented_data)
    return data_list, file_lengths


def feature_extractor(data_list, verbose=True):
    if verbose:
        print(f"extracting the features...", end="  ")
    X, y = {1:[], 2:[], 3:[]}, {1:[], 2:[], 3:[]}
    num_range = trange(0,len(data_list)) if verbose else range(0,len(data_list))
    for j in num_range:
        #extract only xyz columns
        person = data_list[j].loc[0, "subject_id"]
        x_data = data_list[j].drop(columns=["subject_id","activity"])
        X[person].append(get_features(x_data))
        y[person].append(data_list[j].reset_index(drop=True).loc[0, "activity"])
    return X, y


def majority_voting(predictions, file_lengths):
    filtered_predictions = []
    index = 0
    for length in file_lengths:
        file_pred = predictions[index:index+length]
        modes = mode(file_pred)
        majority_choice = modes.mode[0]
        filtered_predictions.extend([majority_choice]*length)
        index += length
    return filtered_predictions

In [37]:
def cv_10_fold(model, X, y, n_repeats=10, verbose=True):
    scores = []
    num_range = trange(n_repeats) if verbose else range(n_repeats)
    for _ in num_range:
        X_data = X[1] + X[2] + X[3]
        y_data = y[1] + y[2] + y[3]
        X_data = np.array(X_data)
        y_data = np.array(y_data)
        scaler = MinMaxScaler()
        X_data_scaled = scaler.fit_transform(X_data)
        # print(f"training model for person {p1}/3...", end="\t")
        cv = StratifiedKFold(n_splits=10, shuffle=False)
        n_scores = cross_val_score(model, X_data_scaled, y_data, scoring='accuracy', cv=cv, 
                            n_jobs=-1, error_score='raise')
        scores.extend(n_scores)
    if verbose:
        print(f"Mean Score: {np.mean(scores)}")
        print(f"Std Score: {np.std(scores)}")
        print(f"Min Score: {np.min(scores)}")
        print(f"Max Score: {np.max(scores)}")
    return scores


def cv_10_fold_pred(model, X, y, n_repeats=1, verbose=True):
    num_range = trange(n_repeats) if verbose else range(n_repeats)
    for _ in num_range:
        X_data = X[1] + X[2] + X[3]
        y_data = y[1] + y[2] + y[3]
        X_data = np.array(X_data)
        y_data = np.array(y_data)
        scaler = MinMaxScaler()
        X_data_scaled = scaler.fit_transform(X_data)
        # print(f"training model for person {p1}/3...", end="\t")
        cv = StratifiedKFold(n_splits=10, shuffle=False)
        y_pred = cross_val_predict(model, X_data, y_data, cv=cv, 
                            n_jobs=-1)
    return y_data, y_pred

In [4]:
def get_processed_dataset(overlap_rate, window_size, verbose=True):
    data_list, file_lengths = dataloader(overlap_rate, window_size, verbose=verbose)
    stream_list = []
    for df in data_list:
        stream_list.append(get_streams(df))
    X, y = feature_extractor(stream_list, verbose=verbose)
    return X, y, file_lengths


def model_evaluator(model, X, y, file_lengths, n_repeats=10, voting=True, verbose=True):
    scores = []
    num_range = trange(n_repeats) if verbose else range(n_repeats)
    for _ in num_range:
        for p1, p2, p3 in [(1,2,3), (2,3,1), (3,1,2)]:
            X_test, y_test = X[p1], y[p1]
            X_train = X[p2] + X[p3]
            y_train = y[p2] + y[p3]
            # print(f"training model for person {p1}/3...", end="\t")
            model.fit(X_train, y_train)
            pred = model.predict(X_test)
            if voting:
                filtered_pred = majority_voting(pred, file_lengths[p1])
                scores.append(accuracy_score(y_test, filtered_pred))
            else:
                scores.append(accuracy_score(y_test, pred))
    if verbose:
        print(f"Mean Score: {np.mean(scores)}")
        print(f"Std Score: {np.std(scores)}")
        print(f"Min Score: {np.min(scores)}")
        print(f"Max Score: {np.max(scores)}")
    return scores


def scaled_model_evaluator(model, X, y, file_lengths, n_repeats=10, voting=True, return_pred=False, verbose=True):
    scores = []
    y_trues = []
    y_preds = []
    num_range = trange(n_repeats) if verbose else range(n_repeats)
    for _ in num_range:
        for p1, p2, p3 in [(1,2,3), (2,3,1), (3,1,2)]:
            X_test, y_test = X[p1], y[p1]
            X_train = X[p2] + X[p3]
            y_train = y[p2] + y[p3]
            scaler = MinMaxScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.fit_transform(X_test)
            # print(f"training model for person {p1}/3...", end="\t")
            model.fit(X_train_scaled, y_train)
            pred = model.predict(X_test_scaled)
            y_trues.append(y_test)
            y_preds.append(pred)
            if voting:
                filtered_pred = majority_voting(pred, file_lengths[p1])
                scores.append(accuracy_score(y_test, filtered_pred))
            else:
                scores.append(accuracy_score(y_test, pred))
    if verbose:
        print(f"Mean Score: {np.mean(scores)}")
        print(f"Std Score: {np.std(scores)}")
        print(f"Min Score: {np.min(scores)}")
        print(f"Max Score: {np.max(scores)}")
    if return_pred:
        return scores, y_trues, y_preds
    else:
        return scores



## Stream Extractors

In [5]:
def get_speed_acc(x_data):
    x_data = x_data.drop(columns=["activity", "subject_id"])
    speed = x_data.diff().fillna(0)
    acc = speed.diff().fillna(0)
    speed.columns = [f"{col}_speed" for col in speed.columns]
    acc.columns = [f"{col}_acc" for col in acc.columns]
    return speed, acc

def get_speed_acc_jerk(x_data):
    x_data = x_data.drop(columns=["activity", "subject_id"])
    speed = x_data.diff().fillna(0)
    acc = speed.diff().fillna(0)
    jerk = acc.diff().fillna(0)
    speed.columns = [f"{col}_speed" for col in speed.columns]
    acc.columns = [f"{col}_acc" for col in acc.columns]
    jerk.columns = [f"{col}_jerk" for col in acc.columns]
    return speed, acc, jerk

In [6]:
def joint_distance(x_data, joint1, joint2):
    """
    returns the distance between two joints. 
    """
    x1, y1, z1 = x_data[f"{joint1}_X"], x_data[f"{joint1}_Y"], x_data[f"{joint1}_Z"]
    x2, y2, z2 = x_data[f"{joint2}_X"], x_data[f"{joint2}_Y"], x_data[f"{joint2}_Z"]
    distance = np.sqrt((x2-x1)**2 + (y2-y1)**2 + (z2-z1)**2)
    return distance

def get_all_joint_distances(x_data):
    """
    calculates all the necessary joint distances from the `x_data`, 
    adds columns to it and returns the modified `x_data`.
    the two joints should not be essentially consecutive, 
    because the distance between two consecutive joints is always constant.
    For example, distance between left_wrist and left_elbow is always constant.
    """
    # joints
    # Front head        ->  left shoulder       (1->8)
    x_data["dist_FH_LS"] = joint_distance(x_data, "FH", "LS")
    # Front head        ->  right shoulder      (1->4)
    x_data["dist_FH_RS"] = joint_distance(x_data, "FH", "RS")
    # left shoulder     ->  left wrist          (8->10)
    x_data["dist_LS_LW"] = joint_distance(x_data, "LS", "LW")
    # right shoulder    ->  right wrist         (4->7)
    x_data["dist_RS_RW"] = joint_distance(x_data, "RS", "RW")
    # v sacral          ->  left elbow          (13->9)
    x_data["dist_VS_LE"] = joint_distance(x_data, "VS", "LE")
    # v sacral          ->  right elbow         (13->6)
    x_data["dist_VS_RE"] = joint_distance(x_data, "VS", "RE")
    # v sacral          ->  left wrist          (13->10)
    x_data["dist_VS_LW"] = joint_distance(x_data, "VS", "LW")
    # v sacral          ->  right wrist         (13->7)
    x_data["dist_VS_RW"] = joint_distance(x_data, "VS", "RW")
    # v sacral          ->  rear head           (13->3)
    x_data["dist_VS_RH"] = joint_distance(x_data, "VS", "RH")
    # v sacral          ->  top head            (13->2)
    x_data["dist_VS_TH"] = joint_distance(x_data, "VS", "TH")
    # left wrist        ->  right wrist         (10->7)
    x_data["dist_LW_RW"] = joint_distance(x_data, "LW", "RW")
    # left asis         ->  left wrist          (12->10)
    x_data["dist_LA_LW"] = joint_distance(x_data, "LA", "LW")
    # right asis        ->  right wrist         (11->7)
    x_data["dist_RA_RW"] = joint_distance(x_data, "RA", "RW")
    # left wrist        ->  top head            (10->2)
    x_data["dist_LW_TH"] = joint_distance(x_data, "LW", "TH")
    # right wrist       ->  top head            (7->2)
    x_data["dist_RW_TH"] = joint_distance(x_data, "RW", "TH")
    # top head          ->  left asis           (2->12)
    x_data["dist_TH_LA"] = joint_distance(x_data, "TH", "LA")
    return x_data



In [7]:
def joint_angle(x_data, joint1, joint2, joint3):
    x1, y1, z1 = x_data[f"{joint1}_X"], x_data[f"{joint1}_Y"], x_data[f"{joint1}_Z"]
    x2, y2, z2 = x_data[f"{joint2}_X"], x_data[f"{joint2}_Y"], x_data[f"{joint2}_Z"]
    x3, y3, z3 = x_data[f"{joint3}_X"], x_data[f"{joint3}_Y"], x_data[f"{joint3}_Z"]
    v1 = np.array([x2-x1, y2-y1, z2-z1]).T
    v2 = np.array([x3-x2, y3-y2, z3-z2]).T
    v1_unit = v1/np.expand_dims(np.linalg.norm(v1, axis=1), axis=1)
    v2_unit = v2/np.expand_dims(np.linalg.norm(v2, axis=1), axis=1)
    angle = np.arccos(np.sum(v1_unit*v2_unit, axis=1)) # dot multiplication
    return angle

def get_all_joint_angles(x_data):
    # joints
    # left shoulder     ->  left elbow      ->  left wrist      (8->9->10)
    x_data["angle_LS_LE_LW"] = joint_angle(x_data, "LS", "LE", "LW")
    # right shoulder    ->  right elbow     ->  right wrist     (4->6->7)
    x_data["angle_RS_RE_RW"] = joint_angle(x_data, "RS", "RE", "RW")
    # right shoulder    ->  left shoulder   ->  front head      (4->8->1)
    x_data["angle_RS_LS_FH"] = joint_angle(x_data, "RS", "LS", "FH")
    # right shoulder    ->  left shoulder   ->  left elbow      (4->8->9)
    x_data["angle_RS_LS_LE"] = joint_angle(x_data, "RS", "LS", "LE")
    # left shoulder     ->  right shoulder  ->  right elbow     (8->4->6)
    x_data["angle_LS_RS_RE"] = joint_angle(x_data, "LS", "RS", "RE")
    # v sacral          ->  right offset    ->  rear head       (13->5->3)
    x_data["angle_VS_RO_RH"] = joint_angle(x_data, "VS", "RO", "RH")
    # vsacral           ->  top head        ->  front head      (13->2->1)
    x_data["angle_VS_TH_FH"] = joint_angle(x_data, "VS", "TH", "FH")
    # v sacral          ->  left shoulder   ->  left elbow      (13->8->9)
    x_data["angle_VS_LS_LE"] = joint_angle(x_data, "VS", "LS", "LE")
    # v sacral          ->  right shoulder  ->  right elbow     (13->4->6)
    x_data["angle_VS_RS_RE"] = joint_angle(x_data, "VS", "RS", "RE")
    # left asis         ->  left shoulder   ->  left elbow      (12->8->9)
    x_data["angle_LA_LS_LE"] = joint_angle(x_data, "LA", "LS", "LE")
    # right asis        -> right shoulder   ->  right elbow     (11->4->6)
    x_data["angle_RA_RS_RE"] = joint_angle(x_data, "RA", "RS", "RE")
    return x_data

In [8]:
def get_all_joint_angles_reduced(x_data):
    # joints
    x_data["angle_LS_LE_LW"] = joint_angle(x_data, "LS", "LE", "LW")
    x_data["angle_RS_RE_RW"] = joint_angle(x_data, "RS", "RE", "RW")
    x_data["angle_RS_LS_LE"] = joint_angle(x_data, "RS", "LS", "LE")
    x_data["angle_LS_RS_RE"] = joint_angle(x_data, "LS", "RS", "RE")
    x_data["angle_VS_RO_RH"] = joint_angle(x_data, "VS", "RO", "RH")
    return x_data

In [9]:
def get_all_joint_angles_diff_reduced(x_data):
    # joints
    x_data["ang_dif_LS_LE_LW"] = np.nan_to_num(np.append(np.diff(joint_angle(x_data, "LS", "LE", "LW")), 0)).tolist()
    x_data["ang_dif_RS_RE_RW"] = np.nan_to_num(np.append(np.diff(joint_angle(x_data, "RS", "RE", "RW")), 0)).tolist()
    x_data["ang_dif_RS_LS_LE"] = np.nan_to_num(np.append(np.diff(joint_angle(x_data, "RS", "LS", "LE")), 0)).tolist()
    x_data["ang_dif_LS_RS_RE"] = np.nan_to_num(np.append(np.diff(joint_angle(x_data, "LS", "RS", "RE")), 0)).tolist()
    x_data["ang_dif_VS_RO_RH"] = np.nan_to_num(np.append(np.diff(joint_angle(x_data, "VS", "RO", "RH")), 0)).tolist()
    return x_data

In [10]:
def get_all_joint_angles_diff_red(x_data):
    # joints
    x_data["ang_dif_LS_LE_LW"] = np.nan_to_num(np.append(np.diff(joint_angle(x_data, "LS", "LE", "LW")), 0)).tolist()
    x_data["ang_dif_RS_RE_RW"] = np.nan_to_num(np.append(np.diff(joint_angle(x_data, "RS", "RE", "RW")), 0)).tolist()
    x_data["ang_dif_RS_LS_LE"] = np.nan_to_num(np.append(np.diff(joint_angle(x_data, "RS", "LS", "LE")), 0)).tolist()
    x_data["ang_dif_LS_RS_RE"] = np.nan_to_num(np.append(np.diff(joint_angle(x_data, "LS", "RS", "RE")), 0)).tolist()
    x_data["ang_dif_VS_RO_RH"] = np.nan_to_num(np.append(np.diff(joint_angle(x_data, "VS", "RO", "RH")), 0)).tolist()
    return x_data

# Pre-processed functions (add/remove streams/features here)

In [11]:
def current_best():
    def get_streams(x_data):
        speed, acc = get_speed_acc(x_data)
        x_data = pd.concat([x_data, speed, acc], axis=1)
        # speed, acc, jerk = get_speed_acc_jerk(x_data)
        # x_data = pd.concat([x_data, speed, acc, jerk], axis=1)
        x_data = get_all_joint_distances(x_data)
        # x_data = get_all_joint_angles(x_data)
        # x_data = get_all_angles_with_plane(x_data)
        return x_data

    def get_features(x_data):
        features = []
        cols = x_data.columns.tolist()
        #Calculate features (STD, Average, Max, Min, Median, Variance) for each data columns X Y Z 
        for k in cols:
            features.append(x_data[k].std(ddof=0))
            features.append(np.max(x_data[k]))
            features.append(np.min(x_data[k]))
            features.append(np.median(x_data[k]))        
        return features
    joint_names = ['FH', 'TH', 'RH', 'RS', 'RO', 'RE', 'RW', 'LS', 'LE', 'LW', 'RA', 'LA', 'VS']
    pos_cols = [i+j for i in joint_names for j in ["_X", "_Y", "_Z"]]
    speed_cols = [f"{col}_speed" for col in pos_cols]
    acc_cols = [f"{col}_acc" for col in pos_cols]
    distance_cols = ['dist_FH_LS', 'dist_FH_RS', 'dist_LS_LW', 'dist_RS_RW', 'dist_VS_LE', 'dist_VS_RE', 
        'dist_VS_LW', 'dist_VS_RW','dist_VS_RH', 'dist_VS_TH', 'dist_LW_RW', 'dist_LA_LW', 'dist_RA_RW', 
        'dist_LW_TH', 'dist_RW_TH', 'dist_TH_LA']
    colnames = pos_cols + speed_cols + acc_cols + distance_cols
    return get_streams, get_features, colnames

In [12]:
def remove_acc():
    def get_streams(x_data):
        speed, acc = get_speed_acc(x_data)
        x_data = pd.concat([x_data, speed], axis=1)
        x_data = get_all_joint_distances(x_data)
        return x_data

    def get_features(x_data):
        features = []
        cols = x_data.columns.tolist()
        for k in cols:
            features.append(x_data[k].std(ddof=0))
            features.append(np.max(x_data[k]))
            features.append(np.min(x_data[k]))
            features.append(np.median(x_data[k]))        
        return features
    joint_names = ['FH', 'TH', 'RH', 'RS', 'RO', 'RE', 'RW', 'LS', 'LE', 'LW', 'RA', 'LA', 'VS']
    pos_cols = [i+j for i in joint_names for j in ["_X", "_Y", "_Z"]]
    speed_cols = [f"{col}_speed" for col in pos_cols]
    distance_cols = ['dist_FH_LS', 'dist_FH_RS', 'dist_LS_LW', 'dist_RS_RW', 'dist_VS_LE', 'dist_VS_RE', 
        'dist_VS_LW', 'dist_VS_RW','dist_VS_RH', 'dist_VS_TH', 'dist_LW_RW', 'dist_LA_LW', 'dist_RA_RW', 
        'dist_LW_TH', 'dist_RW_TH', 'dist_TH_LA']
    colnames = pos_cols + speed_cols + distance_cols
    return get_streams, get_features, colnames

In [13]:
def angle_red_remove_acc():
    def get_streams(x_data):
        speed, acc = get_speed_acc(x_data)
        x_data = pd.concat([x_data, speed], axis=1)
        x_data = get_all_joint_distances(x_data)
        x_data = get_all_joint_angles_diff_red(x_data)
        return x_data

    def get_features(x_data):
        features = []
        cols = x_data.columns.tolist()
        for k in cols:
            features.append(x_data[k].std(ddof=0))
            features.append(np.max(x_data[k]))
            features.append(np.min(x_data[k]))
            features.append(np.median(x_data[k]))        
        return features
    joint_names = ['FH', 'TH', 'RH', 'RS', 'RO', 'RE', 'RW', 'LS', 'LE', 'LW', 'RA', 'LA', 'VS']
    pos_cols = [i+j for i in joint_names for j in ["_X", "_Y", "_Z"]]
    speed_cols = [f"{col}_speed" for col in pos_cols]
    distance_cols = ['dist_FH_LS', 'dist_FH_RS', 'dist_LS_LW', 'dist_RS_RW', 'dist_VS_LE', 'dist_VS_RE', 
        'dist_VS_LW', 'dist_VS_RW','dist_VS_RH', 'dist_VS_TH', 'dist_LW_RW', 'dist_LA_LW', 'dist_RA_RW', 
        'dist_LW_TH', 'dist_RW_TH', 'dist_TH_LA']
    angle_dif_cols = ["LSLELW", "RSRERW", "RSLSLE", "LSRSRE", "VSRORH"]
    colnames = pos_cols + speed_cols + distance_cols + angle_dif_cols
    return get_streams, get_features, colnames

# Model 1: RFC raw

In [29]:
get_streams, get_features, colnames = remove_acc()
overlap_rate, window_size = 0.8, 4000
X, y, file_lengths = get_processed_dataset(overlap_rate, window_size)

loading the data...	

100%|██████████| 151/151 [00:12<00:00, 12.40it/s]


extracting the features...  

100%|██████████| 512/512 [00:41<00:00, 12.48it/s]


In [30]:
model = RFC(2500, criterion="gini", n_jobs=-1)
scores = scaled_model_evaluator(model, X, y, file_lengths, n_repeats=10, voting=True, verbose=True)


100%|██████████| 10/10 [03:52<00:00, 23.21s/it]

Mean Score: 0.76429487594862
Std Score: 0.022429624437675363
Min Score: 0.7293233082706767
Max Score: 0.7973856209150327





# Model 2: RFC with angle

In [15]:
get_streams, get_features, colnames = angle_red_remove_acc()
overlap_rate, window_size = 0.8, 4000
X, y, file_lengths = get_processed_dataset(overlap_rate, window_size)

loading the data...	

100%|██████████| 151/151 [00:13<00:00, 11.35it/s]


extracting the features...  

100%|██████████| 512/512 [00:47<00:00, 10.73it/s]


In [19]:
model = RFC(3000, criterion="gini", n_jobs=-1)
scores, y_trues, y_preds = scaled_model_evaluator(model, X, y, file_lengths, n_repeats=10, return_pred=True, voting=True, verbose=True)


100%|██████████| 10/10 [04:14<00:00, 25.50s/it]

Mean Score: 0.7688451347003635
Std Score: 0.019963577319034663
Min Score: 0.706766917293233
Max Score: 0.8097345132743363





In [23]:
def plot_conf(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    df_cm = pd.DataFrame(cm, index = [i for i in range(1,11)],
                    columns = [i for i in range(1,11)])
    fig = plt.figure(figsize = (10,7))
    sns.heatmap(df_cm, annot=True)
    plt.xlabel("actual")
    plt.ylabel("pred")
    return fig 

In [28]:
cm = confusion_matrix(y_trues[25], y_preds[25])

In [30]:
cm.dump("plots/conf_matrix.pkl")

# Model 3: ETC

In [33]:
get_streams, get_features, colnames = remove_acc()
overlap_rate, window_size = 0.75, 4000
X, y, file_lengths = get_processed_dataset(overlap_rate, window_size)
ft_names = [i+j for i in colnames for j in ["_std", "_max", "_min", "_med"]]

loading the data...	

100%|██████████| 151/151 [00:15<00:00,  9.85it/s]


extracting the features...  

100%|██████████| 414/414 [00:31<00:00, 12.96it/s]


In [34]:
model = ETC(2400, criterion="entropy", max_depth=12, min_samples_split=4, n_jobs=-1)
scores = scaled_model_evaluator(model, X, y, file_lengths, n_repeats=10, voting=True, verbose=True)

100%|██████████| 10/10 [02:45<00:00, 16.54s/it]

Mean Score: 0.7512445423040776
Std Score: 0.021810399301761766
Min Score: 0.7102803738317757
Max Score: 0.7932960893854749





# Ensemble

In [40]:
def get_mode(file_pred):
    modes = mode(file_pred)
    majority_choice = modes.mode[0]
    return majority_choice

In [44]:
def model_ensembler(model1, model2, model3, X, y, file_lengths, n_repeats=10, voting=True, verbose=True):
    scores = []
    num_range = trange(n_repeats) if verbose else range(n_repeats)
    for _ in num_range:
        for p1, p2, p3 in [(1,2,3), (2,3,1), (3,1,2)]:
            X_test, y_test = X[p1], y[p1]
            X_train = X[p2] + X[p3]
            y_train = y[p2] + y[p3]
            scaler = MinMaxScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.fit_transform(X_test)
            # print(f"training model for person {p1}/3...", end="\t")
            
            model1.fit(X_train_scaled, y_train)
            pred1 = model1.predict(X_test_scaled)
            
            model2.fit(X_train_scaled, y_train)
            pred2 = model2.predict(X_test_scaled)

            model3.fit(X_train_scaled, y_train)
            pred3 = model3.predict(X_test_scaled)
            voting = True
            filtered_pred1 = majority_voting(pred1, file_lengths[p1])
            filtered_pred2 = majority_voting(pred2, file_lengths[p1])
            filtered_pred3 = majority_voting(pred3, file_lengths[p1])
            ensembled_pred = []
            for i in zip(pred1, pred2, pred3):
                ensembled_pred.append(get_mode(list(i)))
            scores.append(accuracy_score(y_test, ensembled_pred))
    if verbose:
        print(f"Mean Score: {np.mean(scores)}")
        print(f"Std Score: {np.std(scores)}")
        print(f"Min Score: {np.min(scores)}")
        print(f"Max Score: {np.max(scores)}")
    return scores

In [14]:
get_streams, get_features, colnames = angle_red_remove_acc()
overlap_rate, window_size = 0.8, 4000
X, y, file_lengths = get_processed_dataset(overlap_rate, window_size)

loading the data...	

100%|██████████| 151/151 [00:18<00:00,  8.25it/s]


extracting the features...  

100%|██████████| 512/512 [00:33<00:00, 15.11it/s]


In [22]:
model1 = RFC(2500, criterion="gini", n_jobs=-1)
model2 = RFC(3000, criterion="gini", n_jobs=-1)
model3 = ETC(2400, criterion="entropy", max_depth=12, min_samples_split=4, n_jobs=-1)

In [45]:
model_ensembler(model1, model2, model3, X, y, file_lengths, n_repeats=5, voting=True, verbose=True)

100%|██████████| 5/5 [05:35<00:00, 67.01s/it]

Mean Score: 0.7304229687527859
Std Score: 0.017621510972266344
Min Score: 0.6993464052287581
Max Score: 0.7593984962406015





[0.6993464052287581,
 0.7212389380530974,
 0.7443609022556391,
 0.7189542483660131,
 0.7212389380530974,
 0.7518796992481203,
 0.7058823529411765,
 0.7168141592920354,
 0.7518796992481203,
 0.7189542483660131,
 0.7345132743362832,
 0.7593984962406015,
 0.7254901960784313,
 0.7345132743362832,
 0.7518796992481203]

# Best Models for each of the classifiers

In [81]:
model1 = RFC(2500, criterion="gini", n_jobs=-1)
model2 = RFC(3000, criterion="gini", n_jobs=-1)
model3 = ETC(2400, criterion="entropy", max_depth=12, min_samples_split=4, n_jobs=-1)
model4 = xgb.XGBClassifier(colsample_bytree=1, max_depth = 12, subsample = 1, n_estimators=100, learning_rate = 0.3, objective = "multi:softmax", eval_metric = "mae")
model5 = lgb.LGBMClassifier(boosting_type="goss", learning_rate=0.1)
model6 = GaussianNB()
model7 = SVC(kernel="rbf", C=1.4, gamma=0.02)

In [80]:
scores = scaled_model_evaluator(model7, X, y, file_lengths, n_repeats=10, 
                            voting=True, verbose=True)

100%|██████████| 10/10 [00:03<00:00,  3.30it/s]

Mean Score: 0.7344697129485754
Std Score: 0.03438955689546655
Min Score: 0.6858407079646017
Max Score: 0.7593984962406015





In [83]:
scores = scaled_model_evaluator(GaussianNB(), X, y, file_lengths, n_repeats=10, voting=True, verbose=True)

100%|██████████| 10/10 [00:02<00:00,  4.20it/s]

Mean Score: 0.6414469425921795
Std Score: 0.057550718792237
Min Score: 0.5714285714285714
Max Score: 0.7123893805309734





In [84]:
cv_10_fold(model7, X, y, n_repeats=1, verbose=True)

100%|██████████| 1/1 [00:04<00:00,  4.63s/it]

Mean Score: 0.9241704374057316
Std Score: 0.060040882665894066
Min Score: 0.8076923076923077
Max Score: 1.0





[0.8076923076923077,
 0.8653846153846154,
 0.9411764705882353,
 1.0,
 1.0,
 0.9607843137254902,
 0.8627450980392157,
 0.9411764705882353,
 0.9607843137254902,
 0.9019607843137255]

In [16]:
y_true, y_pred = cv_10_fold_pred(model2, X, y, n_repeats=1, verbose=True)

100%|██████████| 1/1 [01:20<00:00, 80.49s/it]


In [20]:
y_true, y_pred = cv_10_fold_pred(model1, X, y, n_repeats=1, verbose=True)

100%|██████████| 1/1 [00:59<00:00, 59.49s/it]


In [21]:
accuracy_score(y_true, y_pred)

0.96484375

In [18]:
cm = confusion_matrix(y_true, y_pred)

In [19]:
cm.dump("../plots/cv_conf_matrix.pkl")