In [1]:
# Import required libraries
import pandas as pd
import numpy as np  
import matplotlib.pyplot as plt 
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# import csv dataset as pandas dataframe
df = pd.read_csv("../data/encoded.csv", sep=',')

In [3]:
df1 = df.iloc[:, 1:60]
print(df1.head())

   goodDay1_coffee  goodDay1_food  goodDay1_media  goodDay1_other  \
0                0              0               0               0   
1                0              0               0               1   
2                0              1               0               0   
3                0              0               0               1   
4                0              0               0               0   

   goodDay1_productivity  goodDay1_relaxing  goodDay1_sex  goodDay1_sleep  \
0                      1                  0             0               0   
1                      0                  0             0               0   
2                      0                  0             0               0   
3                      0                  0             0               0   
4                      0                  0             0               1   

   goodDay1_social  goodDay1_sport    ...      choco_op_unknown  \
0                0               0    ...              

In [4]:
# import standard dataframe for the program column to use as Y variable
standard = pd.read_csv("../data/standard.csv", sep=',')

In [5]:
df1["program"]=standard.program
df1.head()

Unnamed: 0,goodDay1_coffee,goodDay1_food,goodDay1_media,goodDay1_other,goodDay1_productivity,goodDay1_relaxing,goodDay1_sex,goodDay1_sleep,goodDay1_social,goodDay1_sport,...,gender_female,gender_male,gender_unknown,stand_no,stand_unknown,stand_yes,bedtime,neighbors,randomNum,program
0,0,0,0,0,1,0,0,0,0,0,...,0,1,0,1,0,0,1.0,300.0,7.0,QRM
1,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,1,,100.0,394749.0,CS
2,0,1,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,23.0,5.0,6.0,BA
3,0,0,0,1,0,0,0,0,0,0,...,0,1,0,1,0,0,0.2,2.0,8.0,BA
4,0,0,0,0,0,0,0,1,0,0,...,0,1,0,1,0,0,0.0,6.0,8.0,BDE


In [13]:
# Get rid of the one hot encoded program columns (10 columns)
print(df1.shape)
without_program = df1.drop(df1.columns[22:32], axis=1)
print(without_program.shape)

(217, 59)
(217, 49)


In [14]:
# Some classifiers error out when NA is present.
clean = without_program.dropna()
len(clean)

203

In [15]:
# Assign data to X and y
X_data = clean.drop('program', axis=1)
y_data = clean.program

In [16]:
# Split the data into train and testing, must be stratified, especially since classes are inbalanced.
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, stratify=y_data, test_size=0.30)

In [17]:
# Turn bedtime into an actual number (float)
for time in X_train.bedtime:
    me = float(time)


In [19]:
'''
Search for the best parameters (max_features = # features to consider at each branch, n_estimators = # trees in the forest, cv = kfolds) 
using GridsearchCV
'''

parameters = {'max_features':range(1,49), 'n_estimators':range(5,20)}
clf = GridSearchCV(RandomForestClassifier(), parameters, n_jobs=4, cv=3)
clf.fit(X_train, y_train)
tree_model = clf.best_estimator_
print (clf.best_score_, clf.best_params_)

(0.40845070422535212, {'max_features': 15, 'n_estimators': 10})


In [20]:
# Create forest with the best parameters
forest = RandomForestClassifier(max_features=15, n_estimators=10, criterion='entropy')
forest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features=15, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [21]:
# Test the accuracy of the random forest model on the test set.
y_predicted = forest.predict(X_test)
accuracy_score(y_test, y_predicted)

0.34426229508196721