In [18]:
import glob
import matplotlib
import pandas as pd
import numpy as np
import os
import matplotlib.pylab as plt
from matplotlib import cm
from sklearn.ensemble import RandomForestClassifier
from scipy.cluster.vq import vq, kmeans, whiten
from sklearn.model_selection import train_test_split


matplotlib.style.use('ggplot')
%matplotlib inline

In [47]:
activities = {'Brush_teeth':0,
            'Climb_stairs': 1,
            'Comb_hair':2, 
            'Descend_stairs':3,
            'Drink_glass':4, 
            'Eat_meat':5,
            'Eat_soup':6,
            'Getup_bed':7,
            'Liedown_bed':8,
            'Pour_water':9,
            'Sitdown_chair':10,
            'Standup_chair':11,
            'Use_telephone':12, 
            'Walk':13
            }

X_files = [None]*len(activities)
X_train_files = [None]*len(activities)
X_test_files = [None]*len(activities)

X_train_all = pd.DataFrame()
X_train = [None]*len(activities)
X_test = [None]*len(activities)

chunk_size = 32
k = 480

def loadDataList():
    global X_files, activities
    for a in activities:
        f_list = glob.glob("./HMP_Dataset/" + a + "/*.txt")
        X_files[activities[a]] = f_list
        
def splitTrainTest():
    global X_train_files, X_test_files
    for i, item in enumerate(X_files):
        # not using y_train, ytest
        X_train_files[i], X_test_files[i], y_train, y_test = train_test_split(X_files[i], [i]*len(X_files[i]), test_size=0.2, random_state=10)

def getChunkedVectors(f):
    global chunk_size
    df_file = pd.read_table(f, sep=' ', header=None)
    df = pd.DataFrame()
    for l in range(0, len(df_file) - chunk_size, chunk_size):
        chunk = df_file.iloc[l:l+32].values.flatten()
        df = df.append([chunk])

    return df

def loadTrainingData():
    global X_train_all
    for cls, item in enumerate(X_train_files):
        df = pd.DataFrame()
        for f in X_train_files[cls]:
            X_train_all = X_train_all.append(getChunkedVectors(f))            
        
loadDataList()
splitTrainTest()
loadTrainingData()


In [49]:
codebook, distortion = kmeans(X_train_all.astype(float), 480)

In [69]:
# find histogram for training data

VQ_Train = pd.DataFrame()
VQ_Test = pd.DataFrame()

def computeQuantizedVectors():
    global VQ_Train, VQ_Test, codebook
    for cls, item in enumerate(X_train_files):
        for f in X_train_files[cls]:
            df = getChunkedVectors(f)
            code, dist = vq(df.astype('float'), codebook)
            hist, ed = np.histogram(code, 480)
            VQ_Train  = VQ_Train.append([np.append(hist, cls)])

    for cls, item in enumerate(X_test_files):
        for f in X_test_files[cls]:
            df = getChunkedVectors(f)
            code, dist = vq(df.astype('float'), codebook)
            hist, ed = np.histogram(code, 480)
            VQ_Test  = VQ_Test.append([np.append(hist, cls)])

            
computeQuantizedVectors()

In [71]:
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(VQ_Train.drop(480, axis=1), VQ_Train[[480]].values.flatten())

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [74]:
from sklearn.metrics import accuracy_score

predicted = clf.predict(VQ_Test.drop(480, axis=1))
accuracy_score(VQ_Test[[480]].values.flatten(), predicted)

0.17341040462427745