In [1]:
import glob
import matplotlib
import pandas as pd
import numpy as np
import os
import matplotlib.pylab as plt
from matplotlib import cm
from sklearn.ensemble import RandomForestClassifier
from scipy.cluster.vq import vq, kmeans, whiten
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


matplotlib.style.use('ggplot')
%matplotlib inline

In [2]:
activities = {'Brush_teeth':0,
            'Climb_stairs': 1,
            'Comb_hair':2, 
            'Descend_stairs':3,
            'Drink_glass':4, 
            'Eat_meat':5,
            'Eat_soup':6,
            'Getup_bed':7,
            'Liedown_bed':8,
            'Pour_water':9,
            'Sitdown_chair':10,
            'Standup_chair':11,
            'Use_telephone':12, 
            'Walk':13
            }

def loadDataList():
    global X_files, activities
    for a in activities:
        f_list = glob.glob("./HMP_Dataset/" + a + "/*.txt")
        X_files[activities[a]] = f_list
        
def splitTrainTest():
    global X_train_files, X_test_files
    for i, item in enumerate(X_files):
        # not using y_train, ytest
        X_train_files[i], X_test_files[i], y_train, y_test = train_test_split(X_files[i], [i]*len(X_files[i]), test_size=0.2, random_state=10)

def getChunkedVectors(f):
    global chunk_size, overlap
    df_file = pd.read_table(f, sep=' ', header=None)
    df = pd.DataFrame()
    index = 0
    while index <= len(df_file) - chunk_size:
        chunk = df_file.iloc[index: index + chunk_size].values.flatten()
        df = df.append([chunk])
        index = index + (chunk_size - overlap)
        
    return df

def loadTrainingData():
    global X_train_all
    for cls, item in enumerate(X_train_files):
        df = pd.DataFrame()
        for f in X_train_files[cls]:
            X_train_all = X_train_all.append(getChunkedVectors(f))            

            
def computeQuantizedVectors():
    global VQ_Train, VQ_Test, codebook
    for cls, item in enumerate(X_train_files):
        for f in X_train_files[cls]:
            df = getChunkedVectors(f)
            code, dist = vq(whiten(df.astype('float')), codebook)
            hist, ed = np.histogram(code, bins=np.arange(k+1), density=True)
            VQ_Train  = VQ_Train.append([np.append(hist, cls)])

    for cls, item in enumerate(X_test_files):
        for f in X_test_files[cls]:
            df = getChunkedVectors(f)
            code, dist = vq(whiten(df.astype('float')), codebook)
            hist, ed = np.histogram(code, bins=np.arange(k+1), density=True)
            VQ_Test  = VQ_Test.append([np.append(hist, cls)])

            


In [3]:
# Separate test/train in file names only
X_files = [None]*len(activities)
X_train_files = [None]*len(activities)
X_test_files = [None]*len(activities)


chunk_size = 32
overlap = 28

# Dealing with file names only here
loadDataList()
splitTrainTest()


In [4]:

# Opening the files and loading all training data for kmeans
X_train_all = pd.DataFrame()

loadTrainingData()
print("Loaded training data")


Loaded training data


In [5]:
# Running kmeans on training data to find the clusters
k = 480
from sklearn.cluster import MiniBatchKMeans

km = MiniBatchKMeans(n_clusters=k, random_state=10, batch_size=1000).fit(whiten(X_train_all.astype(float)))
print("Computed codebook")

Computed codebook


In [6]:
codebook = km.cluster_centers_

In [7]:
# find histogram for training data

VQ_Train = pd.DataFrame()
VQ_Test = pd.DataFrame()
            
computeQuantizedVectors()

display(VQ_Train.head())
display(VQ_Test.head())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,471,472,473,474,475,476,477,478,479,480
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002674,0.0,...,0.0,0.0,0.002674,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004902,...,0.02451,0.0,0.0,0.034314,0.0,0.0,0.0,0.0,0.0,0.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.017677,0.0,...,0.0,0.0,0.0,0.025253,0.0,0.0,0.0,0.0,0.0,0.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.002911,0.001456,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.001621,0.009724,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,471,472,473,474,475,476,477,478,479,480
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.008174,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,0.005618,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.005618,0.0,0.009363,0.007491,0.0,0.0,0.0,0.0,0.0,0.0
0,0.0,0.0,0.001282,0.0,0.0,0.0,0.007692,0.0,0.0,0.0,...,0.0,0.003846,0.002564,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [14]:
# Fit to a classifier and check accuracy

clf = RandomForestClassifier(max_depth=20)
clf.fit(VQ_Train.drop(k, axis=1), VQ_Train[[k]].values.flatten())

predicted = clf.predict(VQ_Test.drop(k, axis=1))
accuracy_score(VQ_Test[[k]].values.flatten(), predicted)

0.55491329479768781

In [16]:
from sklearn.metrics import confusion_matrix

confusion_matrix(VQ_Test[[k]].values.flatten(), predicted)

array([[ 3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0, 10,  0,  2,  0,  0,  0,  2,  0,  0,  2,  2,  0,  3],
       [ 0,  0,  4,  0,  2,  0,  0,  1,  0,  0,  0,  0,  0,  0],
       [ 0,  3,  0,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2],
       [ 0,  0,  0,  0, 17,  0,  0,  0,  0,  3,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  1,  0,  1,  0,  0, 10,  0,  2,  3,  4,  0,  0],
       [ 0,  1,  1,  0,  0,  0,  0,  1,  0,  0,  3,  0,  0,  0],
       [ 0,  3,  0,  0,  7,  0,  0,  0,  0, 10,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 13,  4,  0,  3],
       [ 0,  1,  0,  0,  0,  0,  0,  2,  0,  0,  3, 10,  0,  5],
       [ 0,  0,  0,  0,  2,  0,  0,  0,  0,  0,  0,  0,  1,  0],
       [ 0,  6,  0,  1,  0,  0,  0,  0,  0,  0,  1,  0,  0, 12]])