In [1]:
# https://scikit-learn.org/stable/auto_examples/svm/plot_oneclass.html
# https://scikit-learn.org/stable/modules/outlier_detection.html#outlier-detection

In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
from pprint import pprint
from scipy import signal

In [3]:
import matplotlib.pylab as pylab
import matplotlib.pyplot as plt
from matplotlib.ticker import StrMethodFormatter
params = {'legend.fontsize': 'x-large',
          'figure.figsize': (15, 5),
          'axes.labelsize': 'x-large',
          'axes.titlesize': 'x-large',
          'xtick.labelsize': 'x-large',
          'ytick.labelsize': 'x-large'}
pylab.rcParams.update(params)

## Preprocessing

In [7]:
basePath = '../../zju-gaitacc/'
LENGTH = 1000

In [9]:
def session_to_dataframe(session):
    df = pd.DataFrame(columns=range(3 * LENGTH + 1))
    
    i = 0
    for subj in Path(basePath + session).glob('subj_*'):
        for rec in subj.glob('*'):
            with open(rec.joinpath('3.txt')) as f:
                # extract the relevant range
                with open(rec.joinpath('useful.txt')) as use:
                    begin_rec, end_rec = use.readline().split(',')

                begin_rec = int(begin_rec)
                end_rec = int(end_rec)

                # read the content of the file
                lines = list(map(lambda line: [float(x) for x in line.strip().split(',')], f.readlines()))

                # select only relevant portion of the recording and resample it to fixed length
                lines = list(map(lambda x: signal.resample(x[begin_rec:end_rec], LENGTH), lines))

                # select label of measurement
                label = int(subj.name.split('_')[1])

                flat_list = [el for l in lines for el in l]
                flat_list.append(label)
                df.loc[i] = flat_list
                i += 1
                
    return df

In [70]:
RAW_PATH = 'data/raw/'

def raw_to_dataframe(session):
    def split_to_cycles(rec, lims):
        return [rec[lims[idx]:lims[idx + 1]] for idx in range(0, len(lims) - 2)]

    df = pd.DataFrame(columns=range(3 * 128 + 1))
    print(Path(RAW_PATH + session))
    print(list(Path(RAW_PATH + session).glob('subj_*')))
    
    i = 0
    for subj in Path(basePath + session).glob('subj_*'):
        print(subj)
        for rec in subj.glob('*'):
            with open(rec.joinpath('3.txt')) as f:
                # read the content of the file
                lines = list(map(lambda line: [float(x) for x in line.strip().split(',')], f.readlines()))

                # select only relevant portion of the recording and generate cycles
                cycles = range(128, len(lines[0]), 128)
                lines = list(map(lambda x: split_to_cycles(x, cycles), lines))

                # select label of measurement
                label = int(subj.name.split('_')[1])

                for cycle in range(len(lines[0])):
                    l = []
                    for line in range(len(lines)):
                        l.extend(lines[line][cycle])
                    l.append(label)
                    df.loc[i] = l
                    i += 1
                    
    return df

In [71]:
df_s0 = raw_to_dataframe('session_0')
df_s0.head()

data/raw/session_0
[]
../../zju-gaitacc/session_0/subj_010
../../zju-gaitacc/session_0/subj_012
../../zju-gaitacc/session_0/subj_008
../../zju-gaitacc/session_0/subj_006
../../zju-gaitacc/session_0/subj_019
../../zju-gaitacc/session_0/subj_014
../../zju-gaitacc/session_0/subj_004
../../zju-gaitacc/session_0/subj_009
../../zju-gaitacc/session_0/subj_020
../../zju-gaitacc/session_0/subj_017
../../zju-gaitacc/session_0/subj_011
../../zju-gaitacc/session_0/subj_005
../../zju-gaitacc/session_0/subj_007
../../zju-gaitacc/session_0/subj_018
../../zju-gaitacc/session_0/subj_016
../../zju-gaitacc/session_0/subj_002
../../zju-gaitacc/session_0/subj_015
../../zju-gaitacc/session_0/subj_001
../../zju-gaitacc/session_0/subj_022
../../zju-gaitacc/session_0/subj_003
../../zju-gaitacc/session_0/subj_013
../../zju-gaitacc/session_0/subj_021


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,375,376,377,378,379,380,381,382,383,384
0,0.2,0.24,0.24,0.32,0.4,0.4,0.36,0.32,0.32,0.24,...,-0.2,-0.2,-0.16,-0.08,-0.08,-0.08,-0.08,-0.04,-0.04,10.0
1,-0.04,-0.08,-0.12,-0.12,-0.12,-0.16,-0.16,-0.16,-0.16,-0.16,...,-0.16,-0.04,0.44,0.44,0.44,0.44,0.44,0.04,-0.16,10.0
2,0.08,0.12,0.16,0.16,0.16,0.12,0.08,0.04,-0.04,-0.12,...,-0.24,-0.24,-0.24,-0.28,-0.24,-0.24,-0.16,-0.16,-0.2,10.0
3,0.04,0.04,-0.28,-0.28,-0.32,0.0,0.0,-0.04,-0.04,-0.12,...,-0.04,-0.08,-0.12,-0.2,-0.2,-0.28,-0.28,-0.48,-0.52,10.0
4,0.0,0.08,0.08,0.08,0.0,-0.08,-0.2,-0.16,-0.04,-0.04,...,0.04,0.04,0.04,0.04,0.04,0.04,0.04,0.04,0.0,10.0


In [53]:
df_s0.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,375,376,377,378,379,380,381,382,383,384
count,1117.0,1117.0,1117.0,1117.0,1117.0,1117.0,1117.0,1117.0,1117.0,1117.0,...,1117.0,1117.0,1117.0,1117.0,1117.0,1117.0,1117.0,1117.0,1117.0,1117.0
mean,0.013648,0.012968,0.009466,0.007317,0.00732,0.00749,0.012825,0.011368,0.014023,0.012462,...,-0.131519,-0.132174,-0.134988,-0.132474,-0.129105,-0.126783,-0.129273,-0.129048,-0.13108,11.625783
std,0.263504,0.264223,0.268276,0.268199,0.266659,0.262062,0.260868,0.260832,0.256726,0.253932,...,0.240577,0.237431,0.23844,0.242895,0.243835,0.241672,0.241819,0.236346,0.236801,6.591483
min,-1.24,-1.16,-1.04,-1.28,-1.2,-1.2,-1.08,-1.12,-1.04,-1.08,...,-1.2,-1.0,-1.0,-1.04,-1.0,-1.0,-1.0,-0.96,-0.96,1.0
25%,-0.16,-0.16,-0.16,-0.16,-0.16,-0.16,-0.16,-0.125,-0.12,-0.125,...,-0.24,-0.25,-0.28,-0.25,-0.25,-0.24,-0.24,-0.24,-0.25,5.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.12,-0.12,-0.12,-0.12,-0.12,-0.12,-0.12,-0.12,-0.12,12.0
75%,0.16,0.16,0.16,0.16,0.16,0.16,0.16,0.16,0.16,0.16,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0
max,1.04,1.04,1.04,1.04,1.04,1.0,1.0,1.0,1.0,1.04,...,0.75,0.791667,0.75,0.75,0.708333,0.64,0.708333,0.75,0.875,22.0


In [10]:
df.to_csv('full_rec.csv', sep='\t', encoding='utf-8')

NameError: name 'df' is not defined

In [11]:
df_s0 = session_to_dataframe('session_0')
df_s0.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2991,2992,2993,2994,2995,2996,2997,2998,2999,3000


In [12]:
df_s0.shape

(0, 3001)

In [None]:
df_s1 = session_to_dataframe('session_1')
df_s1.head()

In [None]:
df_s1.shape

In [None]:
df_s1.to_csv('full_session1.csv', sep='\t', encoding='utf-8')

## Data loading

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
df_s1 = pd.read_csv('full_session1.csv', sep='\t')

In [None]:
df_s1.shape

In [None]:
df_s1.head()

In [None]:
select_classes = range(1, 21)
# df = pd.concat([df_s1_128, df_s2_128])
df_s1 = df_s1.loc[df_s1[df_s1.columns[-1]].isin(select_classes)]
y = df_s1[df_s1.columns[-1]].values
df_s1.drop([df_s1.columns[-1]], axis=1, inplace=True)
X = df_s1.values

In [None]:
X.shape, y.shape

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, shuffle=True)

In [None]:
X_train.shape, X_test.shape

In [None]:
df_s0 = pd.read_csv('full_session0.csv', sep='\t')

In [None]:
y = df_s0[df_s0.columns[-1]].values
df_s0.drop([df_s0.columns[-1]], axis=1, inplace=True)
X = df_s0.values

In [None]:
s0_X_train, ss0_X_test, s0_y_train, s0_y_test = train_test_split(X, y, test_size=0.2, random_state=0, shuffle=True)

In [None]:
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, init='pca', random_state=0)
X_2d = tsne.fit_transform(X)

In [None]:
import matplotlib.pylab as pylab
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.ticker import StrMethodFormatter
params = {'legend.fontsize': 'x-large',
          'figure.figsize': (15, 12),
          'axes.labelsize': 'x-large',
          'axes.titlesize': 'x-large',
          'xtick.labelsize': 'x-large',
          'ytick.labelsize': 'x-large'}
pylab.rcParams.update(params)

In [None]:
data = np.concatenate((X_2d, y.reshape(y.shape[0], 1)), axis=1)
viz_df = pd.DataFrame(data=data, columns=['x', 'y', 'class'])

In [None]:
import seaborn as sns
fg = sns.FacetGrid(data=viz_df, hue='class', hue_order=np.unique(y), aspect=1.5, height=10)
fg.map(plt.scatter, 'x', 'y').add_legend()

In [None]:
target_ids = np.unique(y)
colors = cm.rainbow(np.linspace(0, 1, len(target_ids)))
for i, c in zip(target_ids, colors):
    plt.scatter(X_2d[y == i, 0], X_2d[y == i, 1], label=i, color=c)
plt.legend()
plt.show()

## One-class SVM

In [None]:
from sklearn import svm

In [None]:
clf = svm.OneClassSVM(nu=0.2, kernel="rbf", gamma='auto')
clf.fit(X_train)

In [None]:
train_pred = clf.predict(X_train)

In [None]:
str(np.count_nonzero(train_pred == -1)) + '/' + str(len(X_train))

In [None]:
test_pred = clf.predict(X_test)

In [None]:
str(np.count_nonzero(test_pred == -1)) + '/' + str(len(X_test))

In [None]:
s0_pred = clf.predict(X)

In [None]:
str(np.count_nonzero(s0_pred == -1)) + '/' + str(len(X))

## USPS

In [None]:
import h5py

In [None]:
with h5py.File('../data/usps.h5', 'r') as hf:
    train = hf.get('train')
    X_train = train.get('data')[:]
    y_train = train.get('target')[:]
    test = hf.get('test')
    X_test = test.get('data')[:]
    y_test = test.get('target')[:]
    
X_train.shape, y_train.shape

In [None]:
X = np.concatenate((X_train, X_test), axis=0)

In [None]:
y = np.concatenate((y_train, y_test), axis=0)

In [None]:
clf = svm.OneClassSVM(nu=0.2, kernel="rbf", gamma='auto')
clf.fit(X_train)