In [1]:
import pandas as pd
import numpy as np
import os
from utils import k_eros

In [2]:
features = [
    'sx_x',
    'sx_y',
    'sx_z',
    'sx_roll',
    'sx_pitch',
    'sx_yaw',
    'sx_thumb',
    'sx_forefinger',
    'sx_middle_finger',
    'sx_ring_finger',
    'sx_little_finger',
    'dx_x',
    'dx_y',
    'dx_z',
    'dx_roll',
    'dx_pitch',
    'dx_yaw',
    'dx_thumb',
    'dx_forefinger',
    'dx_middle_finger',
    'dx_ring_finger',
    'dx_little_finger'
    ]

In [3]:
#path = "./tctodd/"
path = "../../Desktop/MML Project/tctodd/"
dirs = os.listdir(path=path)
weeks = sorted([i for i in dirs if i != ".DS_Store"])
filenames = sorted(os.listdir(path+weeks[1]))

data = []
labels = dict()
label_cnt = 0

for w in weeks:
    temp_path = path+w+"/"
    filenames = sorted(os.listdir(temp_path))
    for fn in filenames:
        label = fn.split('.')[0][:-2]
        
        if label not in labels:
            labels[label] = label_cnt
            label_cnt += 1
            
        data.append({'label':labels[label], 'time_series':pd.read_csv(temp_path+fn, header=None, sep='\t',).values})
        

In [4]:
df = pd.DataFrame(data, columns=['label', 'time_series'])
print(df['label'].value_counts())
df.head()

0     27
60    27
69    27
68    27
67    27
      ..
29    27
28    27
27    27
26    27
94    27
Name: label, Length: 95, dtype: int64


Unnamed: 0,label,time_series
0,0,"[[-0.064909, 0.034318, -0.043964, 0.626383, 0...."
1,0,"[[-0.107059, -0.126109, -0.053742, 0.612516, 0..."
2,0,"[[-0.061427, -0.082576, -0.102991, 0.735469, 0..."
3,1,"[[-0.128178, 0.02695, -0.050126, 0.455028, 0.4..."
4,1,"[[-0.143672, -0.144416, -0.047447, 0.660979, 0..."


In [5]:
from sklearn.model_selection import train_test_split
X = df['time_series']
y = df['label']
seed = 0
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    stratify=y, 
                                                    test_size=0.222222222222222, random_state=seed) # to have 21 and 6 examples in, respectively, train and test set
X_train, X_test = X_train.reset_index(drop=True),X_test.reset_index(drop=True)
y_train, y_test = y_train.reset_index(drop=True), y_test.reset_index(drop=True)

In [6]:
y_train.value_counts()

19    21
23    21
8     21
61    21
44    21
      ..
53    21
9     21
64    21
93    21
94    21
Name: label, Length: 95, dtype: int64

In [7]:
import sklearn as sk
n_folds = 5
skf = sk.model_selection.StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)

## SVC

In [8]:
import sklearn as sk
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import itertools
from tqdm import tqdm

seed = 0
n_pcs = 54

##LINEAR KERNEL
params = [['linear'], [0.0001, 0.0005, 0.001, 0.005, 0.01, 1]]
params_comb = list(itertools.product(*params))

##RBF KERNEL
params = [['rbf'], [0.0001, 0.0005, 0.001, 0.005, 0.01, 1], [0.001, 0.01, 1, 2, 5, 10]]
params_comb += list(itertools.product(*params))

##POLYNOMIAL KERNEL
params = [['poly'], [1e-5, 0.0001, 0.0005, 0.001, 0.005, 0.01, 1], [0.001, 0.01, 1, 2, 5, 10], [3, 6, 10, 15, 20, 23, 25, 30], [0, 0.1, 0.5, 1, 5, 10]]
params_comb += list(itertools.product(*params))

In [9]:
acc_scores = np.zeros(len(params_comb))
for train_index, val_index in skf.split(X_train, y_train):
    X_train_cv = X_train.iloc[train_index]
    y_train_cv = y_train.iloc[train_index]
    X_val_cv = X_train.iloc[val_index]
    y_val_cv = y_train.iloc[val_index]
    X_train_matrix = np.vstack(X_train_cv)
    means_train = np.mean(X_train_matrix, axis=0)
    stds_train = np.std(X_train_matrix, axis=0)

    S, v_list_train = k_eros.compute_S_matrix(X_train_cv, means_train, stds_train)
    _, v_list_test = k_eros.compute_S_matrix(X_val_cv, means_train, stds_train)
    w = k_eros.compute_weight_vector(S, algorithm=2)
    K_eros_train_mc, V, _ = k_eros.perform_PCA(len(X_train_cv), weight_vector=w, v_list=v_list_train)
    Y, _ = k_eros.project_test_data(len(X_train_cv), len(X_val_cv), w, v_list_train, v_list_test, K_eros_train_mc, V)
    princ_components = V[:, :n_pcs]
    test_princ_components = Y[:, :n_pcs]
    for i, params in enumerate(tqdm(params_comb, desc='doing parameters search...')):
        if len(params) == 2:
            combination = tuple([params[0], params[1], 1, 3, 0])
        elif len(params) == 3:
            combination = tuple([params[0], params[1], params[2], 3, 0])
        else:
            combination = params
        svc = SVC(kernel=combination[0], C=combination[1], gamma=combination[2], degree=combination[3], coef0=combination[4], max_iter=10000)
        
        svc.fit(princ_components, y_train_cv.values)
        
        predictions = svc.predict(test_princ_components)
        res = accuracy_score(y_val_cv.values, predictions)
        acc_scores[i] += res
acc_scores = acc_scores / n_folds
best_idx = np.argmax(acc_scores)    
print(f'Found best combination! {params_comb[best_idx]} w. accuracy of {acc_scores[best_idx]}.')
best_comb = params_comb[best_idx]

doing parameters search...: 100%|██████████| 2058/2058 [14:20<00:00,  2.39it/s]
doing parameters search...: 100%|██████████| 2058/2058 [13:46<00:00,  2.49it/s]
doing parameters search...: 100%|██████████| 2058/2058 [13:05<00:00,  2.62it/s]
doing parameters search...: 100%|██████████| 2058/2058 [13:10<00:00,  2.60it/s]
doing parameters search...: 100%|██████████| 2058/2058 [12:45<00:00,  2.69it/s]

Found best combination! ('poly', 0.005, 5, 30, 1) w. accuracy of 0.9488721804511278.





In [10]:
n_princ_cs = [1, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 54]
res_per_pc = np.zeros(len(n_princ_cs))

X_train_matrix = np.vstack(X_train)
means_train = np.mean(X_train_matrix, axis=0)
stds_train = np.std(X_train_matrix, axis=0)

S, v_list_train = k_eros.compute_S_matrix(X_train, means_train, stds_train)
_, v_list_test = k_eros.compute_S_matrix(X_test, means_train, stds_train)
w = k_eros.compute_weight_vector(S, algorithm=2)
K_eros_train_mc, V, eig_vals = k_eros.perform_PCA(len(X_train), weight_vector=w, v_list=v_list_train)
Y, K_eros_test_mc = k_eros.project_test_data(len(X_train), len(X_test), w, v_list_train, v_list_test, K_eros_train_mc, V)
svc = SVC(kernel=best_comb[0], C=best_comb[1], gamma=best_comb[2], degree=best_comb[3], coef0=best_comb[4])
for i, n_pc in enumerate(n_princ_cs):
    princ_components = V[:, :n_pc]
    svc.fit(princ_components, y_train.values)
    test_princ_components = Y[:, :n_pc]
    predictions = svc.predict(test_princ_components)
    res = accuracy_score(y_test.values, predictions)
    res_per_pc[i] = res
print(f'Reached an accuracy of {res_per_pc[-1]}.')
svc_res_per_pc = res_per_pc
print(f"svc_res_per_pc: {svc_res_per_pc}")

Reached an accuracy of 0.9578947368421052.
svc_res_per_pc: [0.02105263 0.33333333 0.6877193  0.86140351 0.88947368 0.93508772
 0.94035088 0.94736842 0.96315789 0.95438596 0.95789474 0.95789474]


## Random Forest Classifier

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ParameterGrid
import itertools
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import KernelPCA
from tqdm import tqdm
criterion = ['gini', 'entropy']
n_estimators = [10, 25, 50, 100, 200]
max_depth = [10, 20, 30, 50, 100]
min_samples_split = [2, 6, 10] # minimum sample number to split a node
min_samples_leaf = [1, 3, 4] # minimum sample number that can be stored in a leaf node
params_grid = {
    'criterion': criterion,
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
}
n_pcs = 54
n_coords = 60
params_list = list(ParameterGrid(params_grid))
acc_scores = np.zeros(len(params_list))
for train_index, val_index in skf.split(X_train, y_train):
    X_train_cv = X_train.iloc[train_index]
    y_train_cv = y_train.iloc[train_index]
    X_val_cv = X_train.iloc[val_index]
    y_val_cv = y_train.iloc[val_index]
    X_train_matrix = np.vstack(X_train_cv)
    means_train = np.mean(X_train_matrix, axis=0)
    stds_train = np.std(X_train_matrix, axis=0)
    S, v_list_train = k_eros.compute_S_matrix(X_train_cv, means_train, stds_train)
    _, v_list_test = k_eros.compute_S_matrix(X_val_cv, means_train, stds_train)
    w = k_eros.compute_weight_vector(S, algorithm=2)
    K_eros_train_mc, V, eig_vals = k_eros.perform_PCA(len(X_train_cv), weight_vector=w, v_list=v_list_train)
    Y, K_eros_test_mc = k_eros.project_test_data(len(X_train_cv), len(X_val_cv), w, v_list_train, v_list_test, K_eros_train_mc, V)
    princ_components = V[:, :n_pcs]
    test_princ_components = Y[:, :n_pcs]
    for i, params in enumerate(tqdm(params_list)):
        rf = RandomForestClassifier(criterion = params['criterion'], n_estimators=params['n_estimators'], max_depth=params['max_depth'], min_samples_split=params['min_samples_split'], min_samples_leaf=params['min_samples_leaf'])
        rf.fit(princ_components, y_train_cv.values)
        predictions = rf.predict(test_princ_components)
        res = accuracy_score(y_val_cv.values, predictions)
        acc_scores[i] += res
acc_scores = acc_scores / n_folds
best_idx = np.argmax(acc_scores)    
print(f'Found best combination! {params_list[best_idx]} w. accuracy of {acc_scores[best_idx]}.')
best_comb = params_list[best_idx]

100%|██████████| 450/450 [33:01<00:00,  4.40s/it]
100%|██████████| 450/450 [31:34<00:00,  4.21s/it]
100%|██████████| 450/450 [31:00<00:00,  4.13s/it]
100%|██████████| 450/450 [31:22<00:00,  4.18s/it]
100%|██████████| 450/450 [31:20<00:00,  4.18s/it]

Found best combination! {'criterion': 'entropy', 'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200} w. accuracy of 0.5157894736842106.





In [12]:
print(f'Found best combination! {params_list[best_idx]} w. accuracy of {acc_scores[best_idx]}.')
best_comb = params_list[best_idx]
params_list[best_idx]

Found best combination! {'criterion': 'entropy', 'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200} w. accuracy of 0.5157894736842106.


{'criterion': 'entropy',
 'max_depth': 30,
 'min_samples_leaf': 1,
 'min_samples_split': 10,
 'n_estimators': 200}

In [13]:
#for the best params combination il valore dell accuracy per 10Kfold
n_princ_cs = [1, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55]
res_per_pc = np.zeros(len(n_princ_cs))
X_train_matrix = np.vstack(X_train)
means_train = np.mean(X_train_matrix, axis=0)
stds_train = np.std(X_train_matrix, axis=0)

S, v_list_train = k_eros.compute_S_matrix(X_train, means_train, stds_train)
_, v_list_test = k_eros.compute_S_matrix(X_test, means_train, stds_train)
w = k_eros.compute_weight_vector(S, algorithm=2)
K_eros_train_mc, V, eig_vals = k_eros.perform_PCA(len(X_train), weight_vector=w, v_list=v_list_train)
Y, K_eros_test_mc = k_eros.project_test_data(len(X_train), len(X_test), w, v_list_train, v_list_test, K_eros_train_mc, V)
rf = RandomForestClassifier(**best_comb)
for i, n_pc in enumerate(n_princ_cs):
    princ_components = V[:, :n_pc]
    rf.fit(princ_components, y_train.values)
    test_princ_components = Y[:, :n_pc]
    predictions = rf.predict(test_princ_components)
    res = accuracy_score(y_test.values, predictions)
    res_per_pc[i] += res

print(f'Reached an accuracy of {res_per_pc[-1]}.')
rf_res_per_pc = res_per_pc
print(f'rf_res_per_pc: {rf_res_per_pc}')

Reached an accuracy of 0.49122807017543857.
rf_res_per_pc: [0.02105263 0.14736842 0.26140351 0.41578947 0.4122807  0.51052632
 0.48421053 0.48421053 0.51052632 0.50877193 0.47192982 0.49122807]


In [14]:
print('SVC accuracy x n principal components: ', svc_res_per_pc)
print('RF accuracy x n principal components: ', rf_res_per_pc)

SVC accuracy x n principal components:  [0.02105263 0.33333333 0.6877193  0.86140351 0.88947368 0.93508772
 0.94035088 0.94736842 0.96315789 0.95438596 0.95789474 0.95789474]
RF accuracy x n principal components:  [0.02105263 0.14736842 0.26140351 0.41578947 0.4122807  0.51052632
 0.48421053 0.48421053 0.51052632 0.50877193 0.47192982 0.49122807]


In [16]:
len(X_test)

570