In [1]:
import pandas as pd
import numpy as np

In [37]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RepeatedKFold, GridSearchCV
from sklearn.metrics import confusion_matrix, cohen_kappa_score, make_scorer

In [3]:
# Load the course dataset from a Feather file. The path is relative, so it is
# resolved against the notebook's current working directory.
cb22x = pd.read_feather("data/HarvardX_CB22x_2013_Spring.feather")

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the raw frame; non-numeric columns are not shown by describe().
cb22x.describe()

Unnamed: 0,engaged,registered_before_launch,registered_after_launch,age,male
count,25093.0,25093.0,25093.0,25093.0,25093.0
mean,0.541147,24.709521,25.712908,30.990396,0.546646
std,0.498314,28.645622,43.479082,12.101414,0.497829
min,0.0,0.0,0.0,14.0,0.0
25%,0.0,0.0,0.0,23.0,0.0
50%,1.0,12.0,0.0,27.0,1.0
75%,1.0,48.0,38.0,36.0,1.0
max,1.0,84.0,176.0,79.0,1.0


In [5]:
# Expand the non-numeric columns into indicator (dummy) columns; numeric
# columns pass through unchanged.
# NOTE(review): describe() lists 5 numeric columns while the train/test split
# reports 14 feature columns, so presumably about 10 indicator columns are
# created here -- confirm which source columns are categorical.
cb22x_clean = pd.get_dummies(cb22x)

In [6]:
# Target vector: the `engaged` column copied out as a NumPy array.
labels = np.array(cb22x_clean['engaged'])

In [7]:
# Predictor matrix: every column except the target.
features = cb22x_clean.drop(labels = 'engaged', axis = 'columns')
# Keep the column names, since the array conversion later discards them.
feature_list = features.columns.tolist()

In [8]:
# Convert the predictor DataFrame to a plain NumPy array for scikit-learn.
# NOTE: this rebinds `features` from a DataFrame to an ndarray; the column
# names remain available in `feature_list`.
features = np.array(features)

In [9]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
(features_train, features_test,
 labels_train, labels_test) = train_test_split(
    features,
    labels,
    train_size = 0.8,
    random_state = 20130810,
)



In [10]:
# Sanity check: report the dimensions of each piece of the train/test split.
shape_report = (
    ('Training Features Shape:', features_train),
    ('Training Labels Shape:', labels_train),
    ('Testing Features Shape:', features_test),
    ('Testing Labels Shape:', labels_test),
)
for caption, split_array in shape_report:
    print(caption, split_array.shape)

Training Features Shape: (20074, 14)
Training Labels Shape: (20074,)
Testing Features Shape: (5019, 14)
Testing Labels Shape: (5019,)


In [11]:
# Baseline random forest, configured but not yet fitted.
rf = RandomForestClassifier(
    # Model size and per-split feature subsampling.
    n_estimators = 1000,
    max_features = 12,
    # Request an out-of-bag score so a held-out estimate is available after fit.
    oob_score = True,
    # Reproducibility and parallelism.
    random_state = 20130810,
    n_jobs = 3,
)

In [12]:
# Train the forest on the training split only. fit() returns the estimator,
# so its repr is displayed as the cell output.
rf.fit(features_train, labels_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=12, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=3,
            oob_score=True, random_state=20130810, verbose=0,
            warm_start=False)

In [13]:
# Out-of-bag score of the fitted forest (populated because the estimator was
# constructed with oob_score = True).
rf.oob_score_

0.60112583441267309

In [53]:
# Tune `max_features` with repeated 3-fold cross-validation (10 repeats),
# scoring each candidate by Cohen's kappa rather than raw accuracy.
# Both the forest and the CV splitter are seeded with the same seed used for
# the train/test split and the baseline forest: without it, the fold
# assignment and the trees differ on every run, so the scores are not
# reproducible.
grid_search = GridSearchCV(RandomForestClassifier(n_jobs = 3,
                                                  random_state = 20130810),
                           param_grid = {'max_features': [6, 8, 10, 12], 'n_estimators': [500]},
                           cv = RepeatedKFold(n_splits = 3, n_repeats = 10,
                                              random_state = 20130810),
                           scoring = make_scorer(cohen_kappa_score))

In [None]:
%%time
# Run the grid search on the training split only (the test split stays held
# out). The grid has 4 candidates and the splitter yields 3 x 10 = 30 folds,
# i.e. 120 forest fits of 500 trees each, so this cell is slow.
grid_search.fit(features_train, labels_train)

In [32]:
# Mean cross-validated test score (Cohen's kappa, per the `scoring` argument)
# for each candidate in the parameter grid.
# NOTE(review): this cell's execution count (32) is lower than that of the
# cell defining `grid_search` (53), so the displayed output may come from an
# earlier configuration -- re-run after fitting to refresh it.
grid_search.cv_results_['mean_test_score']

array([ 0.18663899,  0.18547474,  0.18875557,  0.19094673])