# K-Nearest Neighbors with Dynamic Time Warping distance metric

In [1]:
import pandas as pd
import numpy as np
import os
from utils import base

## Read and create the dataset

In [2]:
# Root folder of the dataset. The loop below expects one sub-directory per "week",
# each containing tab-separated sample files without a header row.
# (Point this at "./tctodd/" instead when the dataset sits next to the notebook.)
path = "../../Desktop/MML Project/tctodd/"
dirs = os.listdir(path=path)
# Keep only real sub-directories: stray files such as macOS's ".DS_Store" cannot be
# listed as a folder and would make the loop below fail.
weeks = sorted(d for d in dirs if os.path.isdir(os.path.join(path, d)))

data = []        # one {'label': int, 'time_series': ndarray} record per sample file
labels = dict()  # label name -> integer id, assigned in order of first appearance
label_cnt = 0    # next unused integer id

for w in weeks:
    # The trailing "" keeps the trailing separator the string-concatenated path used to have.
    temp_path = os.path.join(path, w, "")
    # Hidden files (e.g. ".DS_Store") may also appear inside a week folder; they are not
    # samples, so skip them instead of handing them to the CSV parser.
    filenames = sorted(fn for fn in os.listdir(temp_path) if not fn.startswith('.'))
    for fn in filenames:
        # The label name is the file stem minus its last two characters
        # (presumably a "-<n>" repetition suffix; confirm against the file names).
        label = fn.split('.')[0][:-2]

        if label not in labels:
            labels[label] = label_cnt
            label_cnt += 1

        data.append({
            'label': labels[label],
            'time_series': pd.read_csv(os.path.join(temp_path, fn), header=None, sep='\t').values,
        })
        

In [3]:
from sklearn.model_selection import train_test_split
from tslearn.utils import to_time_series_dataset

# Collect the per-file records into a two-column frame: integer label + raw series array.
df = pd.DataFrame(data, columns=['label', 'time_series'])
y = df['label']
# Convert the column of per-sample arrays into the dataset format tslearn models consume.
X = to_time_series_dataset(df['time_series'])

# Fixed seed so the split (and the CV shuffling that reuses it) is reproducible.
seed = 0
# Stratified hold-out: roughly 2/9 of the samples become the test set, and the class
# proportions of `y` are preserved in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.222222222222222,
    random_state=seed,
    stratify=y,
)


## KNN with Dynamic Time Warping distance metric

In [4]:
from tslearn.neighbors import KNeighborsTimeSeriesClassifier
import sklearn as sk
from sklearn.metrics import accuracy_score
import itertools
from tqdm import tqdm
# Import the splitter explicitly so this cell does not depend on `sklearn.model_selection`
# already having been imported elsewhere for `sk.model_selection` to resolve.
from sklearn.model_selection import StratifiedKFold

n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
# Hyper-parameter grid: number of neighbours x neighbour weighting scheme.
params = [[1, 3, 5], ['uniform', 'distance']]
params_comb = list(itertools.product(*params))
# acc_scores[i] accumulates the validation accuracy of params_comb[i] across folds.
acc_scores = np.zeros(len(params_comb))

# Stratified n_folds-fold cross-validation, run on the training split only so the
# test split stays untouched until the final evaluation.
for train_index, val_index in skf.split(X_train, y_train):
    X_train_cv = X_train[train_index]
    y_train_cv = y_train.iloc[train_index]
    X_val_cv = X_train[val_index]
    y_val_cv = y_train.iloc[val_index]
    # One progress bar per fold. The loop variables are named so that they no longer
    # overwrite the `params` grid defined above.
    # NOTE(review): every combination refits and re-predicts from scratch; the recorded
    # run took roughly 80-100 minutes per fold.
    for i, (n_neighbors, weights) in enumerate(tqdm(params_comb, desc='params search...')):
        knn = KNeighborsTimeSeriesClassifier(n_neighbors=n_neighbors, metric='dtw', weights=weights)
        knn.fit(X_train_cv, y_train_cv)
        preds = knn.predict(X_val_cv)
        res = accuracy_score(y_val_cv, preds)
        acc_scores[i] += res

# Turn the per-combination sums into mean accuracies over the folds.
acc_scores = acc_scores / n_folds
# argmax returns the first maximum, so ties go to the earliest combination in the grid.
best_idx = np.argmax(acc_scores)
print(f'Found best combination! {params_comb[best_idx]} w. accuracy of {acc_scores[best_idx]}.')
best_comb = params_comb[best_idx]

params search...: 100%|██████████| 6/6 [1:22:07<00:00, 821.23s/it]
params search...: 100%|██████████| 6/6 [1:21:57<00:00, 819.60s/it]
params search...: 100%|██████████| 6/6 [1:36:54<00:00, 969.13s/it] 
params search...: 100%|██████████| 6/6 [1:37:36<00:00, 976.03s/it] 
params search...: 100%|██████████| 6/6 [1:25:34<00:00, 855.78s/it]

Found best combination! (1, 'uniform') w. accuracy of 0.7969924812030075.





In [5]:
# Final evaluation: refit on the whole training split with the combination selected by
# cross-validation, then score once on the held-out test split.
best_k, best_weights = best_comb  # (n_neighbors, weights) pair picked by the grid search
knn = KNeighborsTimeSeriesClassifier(
    n_neighbors=best_k,
    weights=best_weights,
    metric='dtw',
)
knn.fit(X_train, y_train)
preds = knn.predict(X_test)
res = accuracy_score(y_test, preds)
print(f'Reached an accuracy of {res}.')

Reached an accuracy of 0.7807017543859649.
