In [2]:
# Load the modules and be done with them
import numpy as np
import pandas as pd
from time import time
from scipy.stats import randint as sp_randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

In [3]:
#Load the data from the WISDM dataset
dataframe = pd.read_csv("/Users/gafergus/Knowhere/data/WISDM_ar_v1.1/WISDM_ar_v1_1_raw.csv", header=None, 
                       names = ["_id", "activity", "timestamp", "acceleration_x", "acceleration_y", "acceleration_z"])
dataframe = dataframe[["timestamp", "acceleration_x", "acceleration_y", "acceleration_z", "activity"]]
dataframe.set_index("timestamp")

Unnamed: 0_level_0,acceleration_x,acceleration_y,acceleration_z,activity
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
49105962326000,-0.694638,12.680544,0.503953,Jogging
49106062271000,5.012288,11.264028,0.953424,Jogging
49106112167000,4.903325,10.882658,-0.081722,Jogging
49106222305000,-0.612916,18.496431,3.023717,Jogging
49106332290000,-1.184970,12.108489,7.205164,Jogging
49106442306000,1.375655,-2.492524,-6.510526,Jogging
49106542312000,-0.612916,10.569390,5.706926,Jogging
49106652389000,-0.503953,13.947236,7.055340,Jogging
49106762313000,-8.430995,11.413852,5.134871,Jogging
49106872299000,0.953424,1.375655,1.648062,Jogging


In [4]:
# load dataset
dataset = dataframe.values
X = dataset[:,0:3].astype(float)
Y = dataset[:,4]
# encode class values as integers
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)
# One Hot Encode
lb = LabelBinarizer()
lb.fit(encoded_Y)
dummy_y = lb.fit_transform(encoded_Y)

In [5]:
# build a classifier
RF_Class = RandomForestClassifier(n_estimators=100)
# Utility function to report best scores
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")


# specify parameters and distributions to sample from
param_dist = {"max_depth": [2, 3, None],
              "max_features": sp_randint(1,3),
              "min_samples_split": sp_randint(2, 3),
              "min_samples_leaf": sp_randint(1, 3),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run randomized search
n_iter_search = 10
random_search = RandomizedSearchCV(RF_Class, param_distributions=param_dist, n_iter=n_iter_search)

start = time()
random_search.fit(X, dummy_y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.cv_results_)

RandomizedSearchCV took 14507.27 seconds for 10 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.360 (std: 0.024)
Parameters: {'bootstrap': False, 'min_samples_leaf': 2, 'min_samples_split': 2, 'criterion': 'gini', 'max_features': 1, 'max_depth': None}

Model with rank: 2
Mean validation score: 0.358 (std: 0.025)
Parameters: {'bootstrap': False, 'min_samples_leaf': 1, 'min_samples_split': 2, 'criterion': 'gini', 'max_features': 1, 'max_depth': None}

Model with rank: 3
Mean validation score: 0.356 (std: 0.020)
Parameters: {'bootstrap': True, 'min_samples_leaf': 1, 'min_samples_split': 2, 'criterion': 'entropy', 'max_features': 2, 'max_depth': None}



In [5]:
# CV the Random Firest Model with the best paramerters
Rand_Forest = RandomForestClassifier(n_estimators=1000, bootstrap = False,\
                                     min_samples_leaf = 2, min_samples_split = 2,\
                                     criterion ='gini', max_features=1, max_depth = None)
scores = cross_val_score(Rand_Forest, X, dummy_y, cv=3)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

KeyboardInterrupt: 