In [1]:
import pandas as pd
import numpy as np
import os
from utils import base

In [2]:
# Load every sample file found under `path`. The layout read here is one
# sub-directory per entry of `weeks`, each holding tab-separated files
# without a header row.
path = "./tctodd/"
dirs = os.listdir(path=path)

# Keep only real, non-hidden sub-directories. Filtering on the exact name
# ".DS_Store" alone lets any other stray file through, which would then
# crash os.listdir() below.
weeks = sorted(
    d for d in dirs
    if not d.startswith('.') and os.path.isdir(os.path.join(path, d))
)

data = []        # one {'label': int, 'time_series': ndarray} record per file
labels = dict()  # label string -> integer id, assigned in first-seen order

for w in weeks:
    temp_path = os.path.join(path, w)
    filenames = sorted(os.listdir(temp_path))
    for fn in filenames:
        # Hidden files (e.g. ".DS_Store") are not samples; skipping them
        # avoids registering an empty label and parsing a non-TSV file.
        if fn.startswith('.'):
            continue

        # The label is the file stem with its last two characters removed.
        # NOTE(review): presumably that strips a "-N" repetition suffix;
        # confirm against the dataset's naming scheme.
        label = fn.split('.')[0][:-2]

        # setdefault assigns the next free id only the first time a label
        # is seen, and returns the existing id otherwise.
        label_id = labels.setdefault(label, len(labels))

        series = pd.read_csv(os.path.join(temp_path, fn), header=None, sep='\t').values
        data.append({'label': label_id, 'time_series': series})

# Kept for backward compatibility: number of distinct labels seen.
label_cnt = len(labels)
        

In [3]:
from sklearn.model_selection import train_test_split
from tslearn.utils import to_time_series_dataset
# Put the loaded records in a frame, convert the per-sample arrays with
# tslearn's to_time_series_dataset, and hold out a stratified test split.
seed = 0
df = pd.DataFrame(data, columns=['label', 'time_series'])
y = df['label']
X = to_time_series_dataset(df['time_series'])

# NOTE(review): test_size is roughly 2/9 — presumably two of nine recording
# sessions' worth of samples; confirm the intent.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.222222222222222,
    stratify=y,
    random_state=seed,
)

Install h5py to use hdf5 features: http://docs.h5py.org/
  warn(h5py_msg)


In [4]:
from tslearn.svm import TimeSeriesSVC
import sklearn as sk
from sklearn.metrics import accuracy_score
import itertools
from tqdm import tqdm

# Grid search over (C, gamma) for the GAK-kernel TimeSeriesSVC, scored by
# mean accuracy across stratified folds of the training split.
n_folds = 5
# NOTE(review): `sk.model_selection` resolves here because an earlier cell
# already imported from sklearn.model_selection; a bare `import sklearn`
# may not expose the submodule on its own.
skf = sk.model_selection.StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)

# params[0] holds the candidate C values, params[1] the candidate gammas.
params = [[1e-4, 1e-2, 1], [1e-1, 1, 10]]
params_comb = list(itertools.product(*params))

# acc_scores[i] accumulates the validation accuracy of params_comb[i] over
# the folds, and is turned into a mean after the loop.
acc_scores = np.zeros(len(params_comb))
for train_index, val_index in skf.split(X_train, y_train):
    X_train_cv = X_train[train_index]
    y_train_cv = y_train.iloc[train_index]
    X_val_cv = X_train[val_index]
    y_val_cv = y_train.iloc[val_index]
    # Unpack into (C, gamma) rather than reusing the name `params`, so the
    # grid defined above is not overwritten by the loop variable.
    for i, (C, gamma) in enumerate(tqdm(params_comb, desc='doing parameters search...')):
        clf = TimeSeriesSVC(C=C, gamma=gamma, kernel="gak", max_iter=1000)
        clf.fit(X_train_cv, y_train_cv)
        predictions = clf.predict(X_val_cv)
        res = accuracy_score(y_val_cv, predictions)
        acc_scores[i] += res
acc_scores = acc_scores / n_folds

# np.argmax returns the first maximum, so ties go to the earliest combination.
best_idx = np.argmax(acc_scores)
print(f'Found best combination! {params_comb[best_idx]} w. accuracy of {acc_scores[best_idx]}.')
best_comb = params_comb[best_idx]

doing parameters search...: 100%|██████████| 9/9 [3:11:05<00:00, 1273.89s/it]  
doing parameters search...: 100%|██████████| 9/9 [4:04:48<00:00, 1632.01s/it]  
doing parameters search...: 100%|██████████| 9/9 [4:05:51<00:00, 1639.10s/it]  
doing parameters search...: 100%|██████████| 9/9 [4:00:34<00:00, 1603.83s/it]  
doing parameters search...: 100%|██████████| 9/9 [4:07:08<00:00, 1647.62s/it]  

Found best combination! (1, 10) w. accuracy of 0.6962406015037594.





In [5]:
# Refit on the whole training split with the selected (C, gamma) pair and
# score once on the held-out test split.
# NOTE(review): the search above capped max_iter at 1000, while this fit
# uses the estimator's default — confirm that difference is intended.
best_C, best_gamma = best_comb
clf = TimeSeriesSVC(kernel="gak", C=best_C, gamma=best_gamma)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
res = accuracy_score(y_true=y_test, y_pred=predictions)
print(f'Reached an accuracy of {res}.')

Reached an accuracy of 0.6859649122807018.
