This notebook was used to investigate which features, how many estimators, and what maximum depth to use with a Random Forest model.

**Do not run this notebook**

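It was run incrementally over the course of multiple nights, with parameter values edited between runs, so the code cells show only the last values that were used.  All of the results are recorded in the Markdown cells.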

In [None]:
%matplotlib inline
import crime
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
import itertools

In [None]:
# Load the cleaned training and test sets through the project-local `crime` module.
train = crime.load_cleaned_train()
# NOTE(review): `test` is not referenced by any other cell visible in this notebook.
test = crime.load_cleaned_test()

In [None]:
# Separate the target from the features WITHOUT mutating `train`.
# The previous `X = train; y = X.pop(...)` removed the column from `train`
# itself, so running this cell a second time raised a KeyError.
y = train['CategoryNumber']
X = train.drop('CategoryNumber', axis=1)

# Hold out half of the labelled data for scoring, stratified on the target so
# both halves keep the same class proportions.  The seed is fixed so that
# scores from different kernel sessions are computed on the same split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, stratify=np.array(y), random_state=0)

In [None]:
def exhaustive_cross_validate(alg, X_train, X_test, y_train, y_test,
                              columns=None, sizes=None):
    """Brute-force search for the best feature subset of each requested size.

    For every subset size in ``sizes``, each combination of that many
    ``columns`` is used to fit ``alg`` on the training data and is scored with
    ``crime.logloss`` on the held-out data.  The lowest-scoring combination
    for each size is printed as ``[logloss, predictors]``.

    Parameters
    ----------
    alg : estimator exposing ``fit(X, y)`` and ``predict_proba(X)``
        Refit in place once per combination.
    X_train, X_test : objects indexable by a list of column names
    y_train, y_test : targets matching the two frames
    columns : list of str, optional
        Candidate feature names.  Defaults to the 13 features studied in this
        notebook.
    sizes : iterable of int, optional
        Subset sizes to search, in the given order.  Defaults to
        ``range(4, 2, -1)`` (sizes 4 and 3), the previously hard-coded value.

    Returns
    -------
    None.  Results are only printed, one line per subset size.  If no
    combination yields a comparable score, ``[inf, None]`` is printed.
    """
    if columns is None:
        columns = ['X', 'Y', 'Year', 'Month', 'Day', 'Hour', 'Minute',
                   'BogusReport', 'NBogusReport', 'DoW', 'PdD', 'CornerCrime', 'ST_0']
    if sizes is None:
        sizes = range(4, 2, -1)

    for size in sizes:
        # Infinity instead of a magic number: any real score beats it.
        best = [float('inf'), None]
        for combo in itertools.combinations(columns, size):
            predictors = list(combo)
            alg.fit(X_train[predictors], y_train)
            p = alg.predict_proba(X_test[predictors])
            ll = crime.logloss(y_test, p)
            # Strict "<" keeps the first combination on ties.
            if ll < best[0]:
                best = [ll, predictors]
        # Single-argument parenthesised form prints identically on Python 2 and 3.
        print(best)

In [None]:
# Exhaustive subset search with a small forest: 8 trees, 8 parallel jobs,
# max_depth left at the library default.
alg = RandomForestClassifier(n_estimators=8, n_jobs=8)
exhaustive_cross_validate(alg, X_train, X_test, y_train, y_test)

[14.683728371472387, ['X', 'Y', 'Year', 'Month', 'Day', 'Hour', 'Minute', 'BogusReport', 'NBogusReport', 'DoW', 'PdD', 'CornerCrime', 'ST_0']]

[14.610171072008116, ['X', 'Y', 'Year', 'Month', 'Day', 'Hour', 'Minute', 'BogusReport', 'NBogusReport', 'DoW', 'PdD', 'CornerCrime']]

[14.613420958436391, ['X', 'Y', 'Year', 'Month', 'Day', 'Hour', 'Minute', 'DoW', 'PdD', 'CornerCrime', 'ST_0']]

[14.649110639497309, ['X', 'Y', 'Year', 'Month', 'Day', 'Hour', 'Minute', 'DoW', 'CornerCrime', 'ST_0']]

[13.843126828673501, ['Year', 'Month', 'Day', 'BogusReport', 'NBogusReport', 'DoW', 'PdD', 'CornerCrime', 'ST_0']]

[5.1849579381704656, ['Year', 'Month', 'Day', 'BogusReport', 'NBogusReport', 'DoW', 'CornerCrime', 'ST_0']]

[3.6906631934980179, ['Hour', 'BogusReport', 'NBogusReport', 'DoW', 'PdD', 'CornerCrime', 'ST_0']]

[2.6517715524983716, ['BogusReport', 'NBogusReport', 'DoW', 'PdD', 'CornerCrime', 'ST_0']]

[2.5402246673556275, ['Minute', 'BogusReport', 'NBogusReport', 'CornerCrime', 'ST_0']]





[2.5312469972587981, ['Minute', 'CornerCrime']]

[2.5699100091743325, ['Minute']]


In [None]:
def additive_cross_validate(alg, X_train, X_test, y_train, y_test, columns=None):
    """Greedy forward feature selection scored by held-out log loss.

    Starting from an empty feature set, each round tries adding every
    remaining column, fits ``alg`` on the training data and scores it with
    ``crime.logloss`` on the held-out data.  The best candidate of the round
    is printed as ``[logloss, predictors]`` and becomes the base set for the
    next round, until every column has been added.

    Parameters
    ----------
    alg : estimator exposing ``fit(X, y)`` and ``predict_proba(X)``
        Refit in place once per candidate.
    X_train, X_test : objects indexable by a list of column names
    y_train, y_test : targets matching the two frames
    columns : list of str, optional
        Candidate feature names.  Defaults to the 13 features studied in this
        notebook.  The caller's list is not modified.

    Returns
    -------
    None.  Results are only printed, one entry per round.
    """
    if columns is None:
        remaining = ['X', 'Y', 'Year', 'Month', 'Day', 'Hour', 'Minute',
                     'BogusReport', 'NBogusReport', 'DoW', 'PdD', 'CornerCrime', 'ST_0']
    else:
        remaining = list(columns)

    selected = []
    for _ in range(len(remaining)):
        # Infinity instead of a magic number: any real score beats it.
        best = [float('inf'), None]
        for col in remaining:
            predictors = selected + [col]
            alg.fit(X_train[predictors], y_train)
            p = alg.predict_proba(X_test[predictors])
            ll = crime.logloss(y_test, p)
            # Strict "<" keeps the first candidate on ties.
            if ll < best[0]:
                best = [ll, predictors]
        # Emits the same bytes as the Python 2 statement `print best, '\n'`
        # while also being valid Python 3.
        print('%s \n' % (best,))
        if best[1] is None:
            # No candidate produced a comparable score (e.g. every score was
            # NaN).  Stop here instead of failing on `col not in None` below.
            break
        selected = best[1]
        remaining = [col for col in remaining if col not in selected]

In [None]:
# Greedy forward selection with 30 trees capped at depth 14, using 8 parallel jobs.
alg = RandomForestClassifier(n_estimators=30, max_depth=14, n_jobs=8)
additive_cross_validate(alg, X_train, X_test, y_train, y_test)

#### RandomForestClassifier(n_estimators=30, max_depth=14):

[2.5694585351830859, ['Minute']] 

[2.5084725184524168, ['Minute', 'Y']] 

[2.3957771209860232, ['Minute', 'Y', 'X']] 

[2.379239152053453, ['Minute', 'Y', 'X', 'CornerCrime']] 

[2.3548346757900438, ['Minute', 'Y', 'X', 'CornerCrime', 'Hour']] 

[2.3446859205183062, ['Minute', 'Y', 'X', 'CornerCrime', 'Hour', 'PdD']] 

[2.3333708291961717, ['Minute', 'Y', 'X', 'CornerCrime', 'Hour', 'PdD', 'Year']] 

[2.3327496065541422, ['Minute', 'Y', 'X', 'CornerCrime', 'Hour', 'PdD', 'Year', 'NBogusReport']] 

[2.3309676449756425, ['Minute', 'Y', 'X', 'CornerCrime', 'Hour', 'PdD', 'Year', 'NBogusReport', 'Month']] 

[2.3340906272822699, ['Minute', 'Y', 'X', 'CornerCrime', 'Hour', 'PdD', 'Year', 'NBogusReport', 'Month', 'DoW']] 

[2.3349194289989383, ['Minute', 'Y', 'X', 'CornerCrime', 'Hour', 'PdD', 'Year', 'NBogusReport', 'Month', 'DoW', 'ST_0']] 

[2.3394136779594907, ['Minute', 'Y', 'X', 'CornerCrime', 'Hour', 'PdD', 'Year', 'NBogusReport', 'Month', 'DoW', 'ST_0', 'BogusReport']] 

[2.3435247653544895, ['Minute', 'Y', 'X', 'CornerCrime', 'Hour', 'PdD', 'Year', 'NBogusReport', 'Month', 'DoW', 'ST_0', 'BogusReport', 'Day']]

#### RandomForestClassifier(n_estimators=20, max_depth=15):

[2.5690825704692752, ['Minute']]

[2.5292489143443384, ['Minute', 'CornerCrime']]

[2.5268526617100604, ['Minute', 'CornerCrime', 'NBogusReport']]

[2.5172761719846157, ['Minute', 'CornerCrime', 'NBogusReport', 'Y']]

[2.4244599303058969, ['Minute', 'CornerCrime', 'NBogusReport', 'Y', 'X']]

[2.4010980676979257, ['Minute', 'CornerCrime', 'NBogusReport', 'Y', 'X', 'Year']]

[2.3834054870571264, ['Minute', 'CornerCrime', 'NBogusReport', 'Y', 'X', 'Year', 'Hour']]

[2.3721289532642253, ['Minute', 'CornerCrime', 'NBogusReport', 'Y', 'X', 'Year', 'Hour', 'PdD']]

[2.3698142139693941, ['Minute', 'CornerCrime', 'NBogusReport', 'Y', 'X', 'Year', 'Hour', 'PdD', 'Day']]

[2.3664897323162504, ['Minute', 'CornerCrime', 'NBogusReport', 'Y', 'X', 'Year', 'Hour', 'PdD', 'Day', 'Month']]

[2.3623433321396212, ['Minute', 'CornerCrime', 'NBogusReport', 'Y', 'X', 'Year', 'Hour', 'PdD', 'Day', 'Month', 'ST_0']]

[2.3634964565219558, ['Minute', 'CornerCrime', 'NBogusReport', 'Y', 'X', 'Year', 'Hour', 'PdD', 'Day', 'Month', 'ST_0', 'BogusReport']]

[2.3645031723112102, ['Minute', 'CornerCrime', 'NBogusReport', 'Y', 'X', 'Year', 'Hour', 'PdD', 'Day', 'Month', 'ST_0', 'BogusReport', 'DoW']]

In [None]:
def cross_validate(alg, X_train, X_test, y_train, y_test, p_set=None):
    """Score ``alg`` on a fixed list of feature sets and print each result.

    For every feature list in ``p_set``, ``alg`` is fit on the training data
    restricted to those columns and scored with ``crime.logloss`` on the
    held-out data.  Each result is printed as ``<logloss> <predictors>``
    followed by a blank line.

    Parameters
    ----------
    alg : estimator exposing ``fit(X, y)`` and ``predict_proba(X)``
        Refit in place once per feature set.
    X_train, X_test : objects indexable by a list of column names
    y_train, y_test : targets matching the two frames
    p_set : list of list of str, optional
        Feature sets to evaluate.  Defaults to the three sets compared in this
        notebook: coordinates only, and two 11-feature sets.

    Returns
    -------
    None.  Results are only printed.
    """
    if p_set is None:
        p_set = [
            ['X', 'Y'],
            ['Minute', 'CornerCrime', 'NBogusReport', 'Y', 'X', 'Year', 'Hour', 'PdD', 'Day', 'Month', 'ST_0'],
            ['X', 'Y', 'DoW', 'Year', 'CornerCrime', 'BogusReport', 'PdD', 'ST_0', 'Minute', 'NBogusReport', 'Hour']
        ]

    for predictors in p_set:
        alg.fit(X_train[predictors], y_train)
        p = alg.predict_proba(X_test[predictors])
        ll = crime.logloss(y_test, p)
        # Emits the same bytes as the Python 2 statement
        # `print ll, predictors, '\n'` while also being valid Python 3.
        print('%s %s \n' % (ll, predictors))

In [None]:
# Sweep the number of trees at a fixed max_depth of 13, scoring the feature
# sets hard-wired into cross_validate.
# NOTE(review): the recorded results below cover 20-30 trees, so this list
# presumably holds only the values from the last run.
for n in [28, 29, 30]:
    # Same output as the Python 2 statement `print n, '\n'`, valid on Python 3 too.
    print('%s \n' % n)
    alg = RandomForestClassifier(n_estimators=n, max_depth=13, n_jobs=4)
    cross_validate(alg, X_train, X_test, y_train, y_test)

20 

2.44848346872 ['X', 'Y'] 

2.34785148667 ['Minute', 'CornerCrime', 'NBogusReport', 'Y', 'X', 'Year', 'Hour', 'PdD', 'Day', 'Month', 'ST_0'] 

2.34991088755 ['X', 'Y', 'DoW', 'Year', 'CornerCrime', 'BogusReport', 'PdD', 'ST_0', 'Minute', 'NBogusReport', 'Hour'] 

21 

2.44537664251 ['X', 'Y'] 

2.35109799565 ['Minute', 'CornerCrime', 'NBogusReport', 'Y', 'X', 'Year', 'Hour', 'PdD', 'Day', 'Month', 'ST_0'] 

2.34676763475 ['X', 'Y', 'DoW', 'Year', 'CornerCrime', 'BogusReport', 'PdD', 'ST_0', 'Minute', 'NBogusReport', 'Hour'] 

22 

2.44570101004 ['X', 'Y'] 

2.35041056097 ['Minute', 'CornerCrime', 'NBogusReport', 'Y', 'X', 'Year', 'Hour', 'PdD', 'Day', 'Month', 'ST_0'] 

2.34793560676 ['X', 'Y', 'DoW', 'Year', 'CornerCrime', 'BogusReport', 'PdD', 'ST_0', 'Minute', 'NBogusReport', 'Hour'] 

23 

2.44636156529 ['X', 'Y'] 

2.34731566701 ['Minute', 'CornerCrime', 'NBogusReport', 'Y', 'X', 'Year', 'Hour', 'PdD', 'Day', 'Month', 'ST_0'] 

2.34320440348 ['X', 'Y', 'DoW', 'Year', 'CornerCrime', 'BogusReport', 'PdD', 'ST_0', 'Minute', 'NBogusReport', 'Hour']

24 

2.44454637732 ['X', 'Y'] 

2.34725978651 ['Minute', 'CornerCrime', 'NBogusReport', 'Y', 'X', 'Year', 'Hour', 'PdD', 'Day', 'Month', 'ST_0'] 

2.34359640994 ['X', 'Y', 'DoW', 'Year', 'CornerCrime', 'BogusReport', 'PdD', 'ST_0', 'Minute', 'NBogusReport', 'Hour'] 

25 

2.44364731696 ['X', 'Y'] 

2.34443776624 ['Minute', 'CornerCrime', 'NBogusReport', 'Y', 'X', 'Year', 'Hour', 'PdD', 'Day', 'Month', 'ST_0'] 

2.34514239669 ['X', 'Y', 'DoW', 'Year', 'CornerCrime', 'BogusReport', 'PdD', 'ST_0', 'Minute', 'NBogusReport', 'Hour'] 

26 

2.44441802356 ['X', 'Y'] 

2.34730402363 ['Minute', 'CornerCrime', 'NBogusReport', 'Y', 'X', 'Year', 'Hour', 'PdD', 'Day', 'Month', 'ST_0'] 

2.34412783892 ['X', 'Y', 'DoW', 'Year', 'CornerCrime', 'BogusReport', 'PdD', 'ST_0', 'Minute', 'NBogusReport', 'Hour']

27 

2.44020118015 ['X', 'Y'] 

2.34142991405 ['Minute', 'CornerCrime', 'NBogusReport', 'Y', 'X', 'Year', 'Hour', 'PdD', 'Day', 'Month', 'ST_0'] 

2.34250960385 ['X', 'Y', 'DoW', 'Year', 'CornerCrime', 'BogusReport', 'PdD', 'ST_0', 'Minute', 'NBogusReport', 'Hour']

28 

2.441266849 ['X', 'Y'] 

2.34420711006 ['Minute', 'CornerCrime', 'NBogusReport', 'Y', 'X', 'Year', 'Hour', 'PdD', 'Day', 'Month', 'ST_0'] 

2.34049810957 ['X', 'Y', 'DoW', 'Year', 'CornerCrime', 'BogusReport', 'PdD', 'ST_0', 'Minute', 'NBogusReport', 'Hour'] 

29 

2.43915082078 ['X', 'Y'] 

2.34235084517 ['Minute', 'CornerCrime', 'NBogusReport', 'Y', 'X', 'Year', 'Hour', 'PdD', 'Day', 'Month', 'ST_0'] 

2.33960594791 ['X', 'Y', 'DoW', 'Year', 'CornerCrime', 'BogusReport', 'PdD', 'ST_0', 'Minute', 'NBogusReport', 'Hour'] 

30 

2.44131955025 ['X', 'Y'] 

2.34010719237 ['Minute', 'CornerCrime', 'NBogusReport', 'Y', 'X', 'Year', 'Hour', 'PdD', 'Day', 'Month', 'ST_0'] 

2.33856726002 ['X', 'Y', 'DoW', 'Year', 'CornerCrime', 'BogusReport', 'PdD', 'ST_0', 'Minute', 'NBogusReport', 'Hour']

In [None]:
# Sweep the maximum tree depth with 30 trees, scoring the feature sets
# hard-wired into cross_validate.
# NOTE(review): the recorded results below cover several depths and both 30-
# and 20-tree forests, so this cell presumably holds only the last run's values.
for d in [15]:
    # Same output as the Python 2 statement `print d, '\n'`, valid on Python 3 too.
    print('%s \n' % d)
    alg = RandomForestClassifier(n_estimators=30, max_depth=d, n_jobs=4)
    cross_validate(alg, X_train, X_test, y_train, y_test)

#### 30 Trees

10

2.37787194338 ['Minute', 'CornerCrime', 'NBogusReport', 'Y', 'X', 'Year', 'Hour', 'PdD', 'Day', 'Month', 'ST_0'] 

2.37461847321 ['X', 'Y', 'DoW', 'Year', 'CornerCrime', 'BogusReport', 'PdD', 'ST_0', 'Minute', 'NBogusReport', 'Hour'] 

11 

2.36341281153 ['Minute', 'CornerCrime', 'NBogusReport', 'Y', 'X', 'Year', 'Hour', 'PdD', 'Day', 'Month', 'ST_0'] 

2.35790772453 ['X', 'Y', 'DoW', 'Year', 'CornerCrime', 'BogusReport', 'PdD', 'ST_0', 'Minute', 'NBogusReport', 'Hour']

12

2.35002368268 ['Minute', 'CornerCrime', 'NBogusReport', 'Y', 'X', 'Year', 'Hour', 'PdD', 'Day', 'Month', 'ST_0'] 

2.34811931089 ['X', 'Y', 'DoW', 'Year', 'CornerCrime', 'BogusReport', 'PdD', 'ST_0', 'Minute', 'NBogusReport', 'Hour'] 

13 

2.34095346239 ['Minute', 'CornerCrime', 'NBogusReport', 'Y', 'X', 'Year', 'Hour', 'PdD', 'Day', 'Month', 'ST_0'] 

2.33900676146 ['X', 'Y', 'DoW', 'Year', 'CornerCrime', 'BogusReport', 'PdD', 'ST_0', 'Minute', 'NBogusReport', 'Hour'] 

14

2.3371343343 ['Minute', 'CornerCrime', 'NBogusReport', 'Y', 'X', 'Year', 'Hour', 'PdD', 'Day', 'Month', 'ST_0'] 

2.33684938339 ['X', 'Y', 'DoW', 'Year', 'CornerCrime', 'BogusReport', 'PdD', 'ST_0', 'Minute', 'NBogusReport', 'Hour'] 

15 

2.34263518062 ['Minute', 'CornerCrime', 'NBogusReport', 'Y', 'X', 'Year', 'Hour', 'PdD', 'Day', 'Month', 'ST_0'] 

2.3455555096 ['X', 'Y', 'DoW', 'Year', 'CornerCrime', 'BogusReport', 'PdD', 'ST_0', 'Minute', 'NBogusReport', 'Hour']

16

2.35694158458 ['Minute', 'CornerCrime', 'NBogusReport', 'Y', 'X', 'Year', 'Hour', 'PdD', 'Day', 'Month', 'ST_0'] 

2.36181354039 ['X', 'Y', 'DoW', 'Year', 'CornerCrime', 'BogusReport', 'PdD', 'ST_0', 'Minute', 'NBogusReport', 'Hour'] 

17 

2.39686766601 ['Minute', 'CornerCrime', 'NBogusReport', 'Y', 'X', 'Year', 'Hour', 'PdD', 'Day', 'Month', 'ST_0'] 

2.41566946359 ['X', 'Y', 'DoW', 'Year', 'CornerCrime', 'BogusReport', 'PdD', 'ST_0', 'Minute', 'NBogusReport', 'Hour'] 

18

2.45479745439 ['Minute', 'CornerCrime', 'NBogusReport', 'Y', 'X', 'Year', 'Hour', 'PdD', 'Day', 'Month', 'ST_0'] 

2.49314013667 ['X', 'Y', 'DoW', 'Year', 'CornerCrime', 'BogusReport', 'PdD', 'ST_0', 'Minute', 'NBogusReport', 'Hour']

19 

2.54698493264 ['Minute', 'CornerCrime', 'NBogusReport', 'Y', 'X', 'Year', 'Hour', 'PdD', 'Day', 'Month', 'ST_0'] 

2.62827137869 ['X', 'Y', 'DoW', 'Year', 'CornerCrime', 'BogusReport', 'PdD', 'ST_0', 'Minute', 'NBogusReport', 'Hour'] 

#### 20 Trees

1

2.65498502873 ['X', 'Y']

2.63707317793 ['Minute', 'CornerCrime', 'NBogusReport', 'Y', 'X', 'Year', 'Hour', 'PdD', 'Day', 'Month', 'ST_0']

2.62140329191 ['X', 'Y', 'DoW', 'Year', 'CornerCrime', 'BogusReport', 'PdD', 'ST_0', 'Minute', 'NBogusReport', 'Hour']

5

2.55814873108 ['X', 'Y']

2.49239340232 ['Minute', 'CornerCrime', 'NBogusReport', 'Y', 'X', 'Year', 'Hour', 'PdD', 'Day', 'Month', 'ST_0']

2.48724280069 ['X', 'Y', 'DoW', 'Year', 'CornerCrime', 'BogusReport', 'PdD', 'ST_0', 'Minute', 'NBogusReport', 'Hour']

10

2.45995948628 ['X', 'Y']

2.38152136082 ['Minute', 'CornerCrime', 'NBogusReport', 'Y', 'X', 'Year', 'Hour', 'PdD', 'Day', 'Month', 'ST_0']

2.37544011608 ['X', 'Y', 'DoW', 'Year', 'CornerCrime', 'BogusReport', 'PdD', 'ST_0', 'Minute', 'NBogusReport', 'Hour']

12

2.44159174338 ['X', 'Y']

2.35403608793 ['Minute', 'CornerCrime', 'NBogusReport', 'Y', 'X', 'Year', 'Hour', 'PdD', 'Day', 'Month', 'ST_0']

2.35609742867 ['X', 'Y', 'DoW', 'Year', 'CornerCrime', 'BogusReport', 'PdD', 'ST_0', 'Minute', 'NBogusReport', 'Hour']

13

2.44415458008 ['X', 'Y']

2.34690779479 ['Minute', 'CornerCrime', 'NBogusReport', 'Y', 'X', 'Year', 'Hour', 'PdD', 'Day', 'Month', 'ST_0']

2.34795865883 ['X', 'Y', 'DoW', 'Year', 'CornerCrime', 'BogusReport', 'PdD', 'ST_0', 'Minute', 'NBogusReport', 'Hour']

14

2.46416631795 ['X', 'Y']

2.35253302225 ['Minute', 'CornerCrime', 'NBogusReport', 'Y', 'X', 'Year', 'Hour', 'PdD', 'Day', 'Month', 'ST_0']

2.35274031197 ['X', 'Y', 'DoW', 'Year', 'CornerCrime', 'BogusReport', 'PdD', 'ST_0', 'Minute', 'NBogusReport', 'Hour']

15

2.48506095256 ['X', 'Y']

2.36683234659 ['Minute', 'CornerCrime', 'NBogusReport', 'Y', 'X', 'Year', 'Hour', 'PdD', 'Day', 'Month', 'ST_0']

2.36724282083 ['X', 'Y', 'DoW', 'Year', 'CornerCrime', 'BogusReport', 'PdD', 'ST_0', 'Minute', 'NBogusReport', 'Hour']

16

2.55163994484 ['X', 'Y']

2.40294553465 ['Minute', 'CornerCrime', 'NBogusReport', 'Y', 'X', 'Year', 'Hour', 'PdD', 'Day', 'Month', 'ST_0']

2.41081171416 ['X', 'Y', 'DoW', 'Year', 'CornerCrime', 'BogusReport', 'PdD', 'ST_0', 'Minute', 'NBogusReport', 'Hour']

18

2.76749259945 ['X', 'Y']

2.55098307538 ['Minute', 'CornerCrime', 'NBogusReport', 'Y', 'X', 'Year', 'Hour', 'PdD', 'Day', 'Month', 'ST_0']

2.59135290047 ['X', 'Y', 'DoW', 'Year', 'CornerCrime', 'BogusReport', 'PdD', 'ST_0', 'Minute', 'NBogusReport', 'Hour']

20

3.13784003235 ['X', 'Y']

2.9446285317 ['Minute', 'CornerCrime', 'NBogusReport', 'Y', 'X', 'Year', 'Hour', 'PdD', 'Day', 'Month', 'ST_0']

3.0614008462 ['X', 'Y', 'DoW', 'Year', 'CornerCrime', 'BogusReport', 'PdD', 'ST_0', 'Minute', 'NBogusReport', 'Hour']








