Use grid search for better parameters.

- Can we make a better RF model? **yes**

We can improve on the default model by increasing the number of trees (from the default of 10 to 50) and tuning the model's hyperparameters with a cross-validated grid search.

In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import make_scorer
from kaggle.data import load_data, save_submission
from kaggle.metrics import rmse_log

In [2]:
# load data
y, X, X_submission, ids_submission = load_data()

# parameters for tuning
# 1 x 7 x 5 x 4 = 140 candidate settings; each one is scored on every CV fold
parameters = {'n_estimators': [50],
              'max_features': [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8],
              'max_depth': [7, 9, 11, 13, 15],
              'min_samples_leaf': [1, 2, 3, 4]}

# define grid search model
# - rmse_log is an error measure (lower is better), so the scorer is built with
#   greater_is_better=False: scikit-learn negates it and reported scores are <= 0.
# - KFold is used without shuffling (contiguous folds). random_state is
#   deliberately NOT passed: it has no effect unless shuffle=True, and
#   scikit-learn >= 0.24 raises ValueError when it is set on a non-shuffling
#   KFold. The folds are the same as with the previous
#   KFold(5, random_state=1773) call.
# - The forest itself is seeded so repeated runs give the same trees.
model = GridSearchCV(RandomForestRegressor(random_state=1773, n_jobs=4),
                     parameters,
                     scoring=make_scorer(rmse_log, greater_is_better=False),
                     cv=KFold(n_splits=5))

In [4]:
%%time
# train grid search model: every combination in the parameter grid
# (1 x 7 x 5 x 4 = 140) is fitted on each of the 5 CV folds, then the best
# combination is refit on all of X, y (refit=True, the GridSearchCV default)
model.fit(X, y)

CPU times: user 5min 30s, sys: 21.1 s, total: 5min 51s
Wall time: 5min 34s


GridSearchCV(cv=KFold(n_splits=5, random_state=1773, shuffle=False),
       error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=4, oob_score=False, random_state=1773,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [50], 'max_features': [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8], 'max_depth': [7, 9, 11, 13, 15], 'min_samples_leaf': [1, 2, 3, 4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(rmse_log, greater_is_better=False), verbose=0)

In [5]:
# Report the winning configuration.
# The score comes from a scorer built with greater_is_better=False, so it is
# the negated log-RMSE: values closer to 0 are better.
# A single pre-formatted string is passed to print(...) so the line produces
# the same text under Python 2 (print statement) and Python 3 (print function).
print('best score: {}'.format(model.best_score_))
print('best parameters: {}'.format(model.best_params_))

best score: -0.143705376285
best parameters: {'max_features': 0.5, 'n_estimators': 50, 'max_depth': 13, 'min_samples_leaf': 2}


In [6]:
# create predictions for the submission set; with refit=True (the GridSearchCV
# default) predict() uses the best estimator refit on the full training data
y_pred = model.predict(X_submission)

# save submission
# NOTE(review): save_submission is a project-local helper; presumably it pairs
# ids_submission with y_pred and names the output after the last argument — confirm
save_submission(ids_submission, y_pred, '03_grid_search_random_forest')