In [8]:
import glob
import matplotlib
import pandas as pd
import numpy as np
import os
import matplotlib.pylab as plt
from matplotlib import cm
from sklearn.ensemble import RandomForestClassifier
from scipy.cluster.vq import vq, kmeans, whiten
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


matplotlib.style.use('ggplot')
%matplotlib inline

In [9]:
activities = {'Brush_teeth':0,
            'Climb_stairs': 1,
            'Comb_hair':2, 
            'Descend_stairs':3,
            'Drink_glass':4, 
            'Eat_meat':5,
            'Eat_soup':6,
            'Getup_bed':7,
            'Liedown_bed':8,
            'Pour_water':9,
            'Sitdown_chair':10,
            'Standup_chair':11,
            'Use_telephone':12, 
            'Walk':13
            }

# Separate test/train in file names only
X_files = [None]*len(activities)
X_train_files = [None]*len(activities)
X_test_files = [None]*len(activities)



def loadDataList():
    global X_files, activities
    for a in activities:
        f_list = glob.glob("./HMP_Dataset/" + a + "/*.txt")
        X_files[activities[a]] = f_list
        
def splitTrainTest():
    global X_train_files, X_test_files
    for i, item in enumerate(X_files):
        # not using y_train, ytest
        X_train_files[i], X_test_files[i], y_train, y_test = train_test_split(X_files[i], [i]*len(X_files[i]), test_size=0.2, random_state=10)

def getChunkedVectors(f):
    global chunk_size
    df_file = pd.read_table(f, sep=' ', header=None)
    df = pd.DataFrame()
    for l in range(0, len(df_file) - chunk_size, chunk_size):
        chunk = df_file.iloc[l:l+32].values.flatten()
        df = df.append([chunk])

    return df

def loadTrainingData():
    global X_train_all
    for cls, item in enumerate(X_train_files):
        df = pd.DataFrame()
        for f in X_train_files[cls]:
            X_train_all = X_train_all.append(getChunkedVectors(f))            

            
def computeQuantizedVectors():
    global VQ_Train, VQ_Test, codebook
    for cls, item in enumerate(X_train_files):
        for f in X_train_files[cls]:
            df = getChunkedVectors(f)
            code, dist = vq(df.astype('float'), codebook)
            hist, ed = np.histogram(code, k, density=True)
            VQ_Train  = VQ_Train.append([np.append(hist, cls)])

    for cls, item in enumerate(X_test_files):
        for f in X_test_files[cls]:
            df = getChunkedVectors(f)
            code, dist = vq(df.astype('float'), codebook)
            hist, ed = np.histogram(code, k, density=True)
            VQ_Test  = VQ_Test.append([np.append(hist, cls)])

            


In [10]:
chunk_size = 32
overlap = 30

# Dealing with file names only here
loadDataList()
splitTrainTest()

# Opening the files and loading all training data for kmeans
X_train_all = pd.DataFrame()

loadTrainingData()

# Running kmeans on training data to find the clusters
k = 480
codebook, distortion = kmeans(X_train_all.astype(float), k)

In [11]:
# find histogram for training data

VQ_Train = pd.DataFrame()
VQ_Test = pd.DataFrame()
            
computeQuantizedVectors()

display(VQ_Train.head())
display(VQ_Test.head())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,471,472,473,474,475,476,477,478,479,480
0,0.201734,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025217,0.0
0,0.14652,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.19536,0.0
0,0.068289,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011381,0.0
0,0.013164,0.0,0.0,0.0,0.0,0.013164,0.0,0.0,0.0,0.013164,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.013164,0.013164,0.0
0,0.014548,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.014548,0.0,0.043644,0.0


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,471,472,473,474,475,476,477,478,479,480
0,0.011467,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022934,0.0
0,0.017866,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.035732,0.0,0.0,0.0,0.0,0.0,0.0,0.017866,0.0
0,0.010765,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.021529,0.0
0,0.090498,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090498,1.0
0,0.085653,0.085653,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.085653,1.0


In [12]:
# Fit to a classifier and check accuracy

clf = RandomForestClassifier(max_depth=10, random_state=0)
clf.fit(VQ_Train.drop(k, axis=1), VQ_Train[[k]].values.flatten())

predicted = clf.predict(VQ_Test.drop(k, axis=1))
accuracy_score(VQ_Test[[k]].values.flatten(), predicted)

0.26011560693641617