In [2]:
import numpy as np
import pandas as pd
from os import listdir
from sklearn.cluster import KMeans
from collections import Counter
from random import shuffle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from knndtw import KNN_DTW

In [3]:
# Dataset root; it is expected to contain one sub-folder per entry of `folder_names`.
folder_path = 'HMP_Dataset'

# Activity sub-folders. The position of a name in this list is used as the
# integer class label (see the enumerate() in make_file_list).
folder_names = [
    'Brush_teeth',
    'Climb_stairs',
    'Comb_hair',
    'Descend_stairs',
    'Drink_glass',
    'Eat_meat',
    'Eat_soup',
    'Getup_bed',
    'Liedown_bed',
    'Pour_water',
    'Sitdown_chair',
    'Standup_chair',
    'Use_telephone',
    'Walk',
]

In [4]:
def get_files(path):
    """Return the names of the entries in directory `path`, sorted alphabetically.

    `os.listdir` returns entries in arbitrary order, so the result is sorted to
    make the file list independent of the filesystem's enumeration order.

    Parameters
    ----------
    path : str
        Directory to list.

    Returns
    -------
    list of str
        Entry names (not full paths) in ascending order.
    """
    return sorted(listdir(path))

In [5]:
def make_file_list(folder_names):
    """Build a flat list of (file path, label) pairs for every listed folder.

    Each folder is looked up under the module-level `folder_path`; the label of
    a file is the position of its folder in `folder_names`.

    Parameters
    ----------
    folder_names : sequence of str
        Sub-folder names, in label order.

    Returns
    -------
    list of (str, int)
        One tuple per file returned by get_files(), grouped by folder.
    """
    return [
        (folder_path + '/' + activity + '/' + entry, label)
        for label, activity in enumerate(folder_names)
        for entry in get_files(folder_path + '/' + activity)
    ]

In [6]:
def tts(file_list, train_size):
    """Randomly split (path, label) pairs into train and test subsets.

    Parameters
    ----------
    file_list : sequence of (str, int)
        Pairs as produced by make_file_list(); must be non-empty.
    train_size : float or int
        Forwarded to sklearn's train_test_split as `train_size`; the test
        subset is the complement.

    Returns
    -------
    list
        The result of train_test_split(X, y, ...): train paths, test paths,
        train labels, test labels, in that order.
    """
    X, y = zip(*file_list)
    # The split used to be hard-coded to test_size=0.2, silently ignoring the
    # argument. Callers passing 0.8 still get an 80/20 split.
    return train_test_split(X, y, train_size=train_size)

In [7]:
def get_num_lines(filename):
    """Count the lines of the text file `filename`.

    The file is iterated line by line rather than read into memory at once.
    """
    with open(filename) as handle:
        return sum(1 for _ in handle)

In [8]:
def get_raw_features(filename, block_size, file_id):
    """Cut one whitespace-delimited recording into fixed-size, flattened blocks.

    The file is read as a header-less numeric table. Consecutive groups of
    `block_size` rows are flattened row by row into a single feature vector;
    trailing rows that do not fill a whole block are dropped.

    Parameters
    ----------
    filename : str
        Path of the whitespace-separated data file.
    block_size : int
        Number of consecutive rows per block (must be non-zero).
    file_id : int
        Identifier stored as the index value of every returned row.

    Returns
    -------
    pandas.DataFrame
        One row per block with `block_size * n_columns` float columns,
        indexed by a constant index named 'file_id'.
    """
    # sep=r'\s+' replaces the deprecated delim_whitespace=True.
    df = pd.read_table(filename, sep=r'\s+', header=None)

    n_blocks = len(df) // block_size
    # A C-order reshape of the first n_blocks * block_size rows lays each block
    # out exactly as ravel() on the block's rows would, without a Python loop.
    # The column count comes from the file instead of a hard-coded 3.
    ret = (
        df.values[:n_blocks * block_size]
        .reshape(n_blocks, block_size * df.shape[1])
        .astype(float)
    )

    df_ret = pd.DataFrame(ret)
    df_ret['file_id'] = file_id

    return df_ret.set_index('file_id')

In [9]:
def get_rawX(files, file_labels, block_size):
    """Load block features for a list of files and collect their labels.

    Parameters
    ----------
    files : sequence of str
        Paths of the recordings to load.
    file_labels : sequence
        Label of each file, aligned with `files`.
    block_size : int
        Rows per block, forwarded to get_raw_features().

    Returns
    -------
    raw_X : pandas.DataFrame
        All blocks of all files stacked vertically; the 'file_id' index holds
        the position of the originating file in `files`.
    labels : pandas.DataFrame
        Single column 'y', indexed 0..len(files)-1, holding `file_labels`.
    """
    labels = pd.DataFrame(index=range(len(files)), columns=['y'])

    # Collect per-file frames and concatenate once. DataFrame.append was removed
    # in pandas 2.0, and appending inside a loop re-copies all data each time.
    frames = []
    for i, filename in enumerate(files):
        frames.append(get_raw_features(filename, block_size, i))
        labels.loc[i] = file_labels[i]

    # pd.concat raises on an empty list; keep the empty-frame result for no files.
    raw_X = pd.concat(frames) if frames else pd.DataFrame()

    return raw_X, labels

In [64]:
def get_clustered_data(raw_X, alphabet_size, clf=None):
    """Turn every file's blocks into a sequence of cluster indices.

    Parameters
    ----------
    raw_X : pandas.DataFrame
        Block features whose index is named 'file_id'. It is not modified.
    alphabet_size : int
        Number of clusters; only used when a new KMeans model is fitted.
    clf : fitted clusterer with a `predict` method, optional
        When omitted, a KMeans model is fitted on `raw_X` and its centers are
        reordered by ascending coordinate sum.

    Returns
    -------
    arr : list of numpy.ndarray
        One array of cluster indices per distinct file_id, in ascending
        file_id order.
    clf : object
        The clusterer that was used (newly fitted, or the one passed in).
    """
    if clf is None:
        clf = KMeans(n_clusters=alphabet_size)
        clf.fit(raw_X)
        # Reorder the centers by coordinate sum so symbol indices follow that order.
        # NOTE(review): clf.labels_ from fit() presumably still refers to the
        # pre-sort numbering, so only predict() is used below.
        order = np.argsort(np.sum(clf.cluster_centers_, axis=1))
        clf.cluster_centers_ = np.take(clf.cluster_centers_, order, axis=0)

    # Keep the predictions in a separate Series instead of adding a 'cluster'
    # column to raw_X. The column would mutate the caller's frame and be fed to
    # predict() as an extra feature on any later call with the same frame.
    symbols = pd.Series(clf.predict(raw_X), index=raw_X.index)

    arr = [group.values for _, group in symbols.groupby(level='file_id')]

    return arr, clf

In [11]:
def get_longest_arr(data):
    """Return the first element of `data` with the greatest length.

    An empty list is returned when `data` is empty or contains only
    zero-length elements.
    """
    best, best_len = [], 0
    for candidate in data:
        candidate_len = len(candidate)
        if candidate_len > best_len:
            best, best_len = candidate, candidate_len
    return best

In [12]:
def create_ts_arr(arr, c):
    """Resample `arr` to `c` samples by picking evenly spaced elements.

    Output position i takes the element at index floor(i * len(arr) / c).

    Parameters
    ----------
    arr : array-like
        Source sequence.
    c : int
        Number of samples in the result.

    Returns
    -------
    numpy.ndarray
        `c` elements of `arr`, selected with np.take.
    """
    # Compute the indices with exact integer arithmetic. The float form
    # (i * (len(arr) / c)).astype(int) can land just below a whole number,
    # e.g. 49 * (2 / 98) == 0.9999999999999999, and then truncate one too low.
    indices = (np.arange(c) * len(arr)) // c
    return np.take(arr, indices)

In [12]:
def create_timeseries_df(data, c):
    """Resample every sequence in `data` to length `c` and stack them as rows.

    Parameters
    ----------
    data : sequence of array-like
        One sequence per output row.
    c : int
        Common length, which is also the number of columns.

    Returns
    -------
    pandas.DataFrame
        Frame with len(data) rows and columns 0..c-1; row k holds
        create_ts_arr(data[k], c).
    """
    n_rows = len(data)
    frame = pd.DataFrame(index=range(n_rows), columns=range(c))

    for row_label, sequence in enumerate(data):
        resampled = create_ts_arr(sequence, c)
        frame.loc[row_label] = resampled

    return frame

In [66]:
def run_classifier(classifier, block_size=32, alphabet_size=32, n_eval=10):
    """Train `classifier` on clustered symbol sequences and report test accuracy.

    Pipeline: random 80/20 file split -> fixed-size blocks per file -> KMeans
    fitted on the training blocks -> one sequence of cluster indices per file
    -> classifier fit / predict.

    Parameters
    ----------
    classifier : object
        Estimator exposing fit(X, y) and predict(X).
    block_size : int, default 32
        Rows per block (see get_raw_features()).
    alphabet_size : int, default 32
        Number of KMeans clusters.
    n_eval : int or None, default 10
        Only the first `n_eval` test sequences are predicted and scored;
        None scores all of them.

    Returns
    -------
    float
        Fraction of the evaluated test sequences predicted correctly.
    """
    train_files, test_files, train_labels, test_labels = tts(make_file_list(folder_names), 0.8)

    rawX_train, train_y_labels = get_rawX(train_files, train_labels, block_size)
    train_X, train_clf = get_clustered_data(rawX_train, alphabet_size)

    # NOTE(review): the sequences in train_X can differ in length; confirm that
    # np.array() on such a list yields what `classifier` expects.
    classifier.fit(np.array(train_X), train_y_labels.values)

    rawX_test, test_y_labels = get_rawX(test_files, test_labels, block_size)
    # Reuse the clusterer fitted on the training data so symbols are comparable.
    test_X, _ = get_clustered_data(rawX_test, alphabet_size, clf=train_clf)

    # NOTE(review): a file with fewer than block_size rows yields no blocks and so
    # no sequence, which would shift test_X relative to test_y_labels — verify.
    y_predict = classifier.predict(np.array(test_X[:n_eval]))
    y_true = test_y_labels.values[:n_eval].ravel()

    print(y_predict)
    print(y_true)

    return np.equal(y_predict, y_true).astype(int).sum() / y_predict.shape[0]

In [68]:
# Time a full train/evaluate run of the KNN_DTW classifier and print each run's accuracy.
# NOTE(review): `timeit` is not imported anywhere in this notebook; presumably this
# relies on IPython automagic resolving it to %timeit (the output below has the
# %timeit summary format) — confirm, or spell the magic out explicitly.
timeit(print(run_classifier(KNN_DTW(), block_size=5, alphabet_size=15)))

[7 3]
[7 3]
1.0
[4 4]
[4 4]
1.0
[13  1]
[1 1]
0.5
[11  3]
[11  3]
1.0
[4 4]
[4 4]
1.0
[12  1]
[4 3]
0.0
[ 9 13]
[ 9 13]
1.0
[7 1]
[ 7 13]
0.5
1min 19s ± 18.1 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [68]:
# Sanity check for create_ts_arr: stretch the 3-element array `a` to the
# length of `b` (5 samples); `b` is only used for its length.
a = np.array([3,2,1])
b = np.array([4,5,6,7,8])
create_ts_arr(a, len(b))

array([3, 3, 2, 2, 1])

In [15]:
# NOTE: `y_predict` is a local variable of run_classifier(), so it does not exist
# at notebook scope and this cell raises NameError (see the output below).
y_predict

NameError: name 'y_predict' is not defined