Can we be more effective than a full grid search?

We were scoring across 140 different models before; what happens if we
randomly select a sample of them to score instead? In practice, 60 is normally enough to be
reasonably close to the global optimum.

We should get close to the full grid search result in less than half the time.

In [1]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.metrics import make_scorer
from kaggle.data import load_data, save_submission
from kaggle.metrics import rmse

In [2]:
# load_data() yields four values: the target, the training features, the
# features to predict for the submission, and the submission ids
# (roles inferred from the names they are unpacked into -- TODO confirm).
y_raw, X, X_submission, ids_submission = load_data()

# Everything downstream is fit against the natural log of the target, so the
# model's predictions are in log space as well.
y = np.log(y_raw)

In [7]:
# Single seed shared by the forest and the parameter sampler so the search is
# reproducible from a fresh kernel.
SEED = 1773

# Discrete search space: 1 * 7 * 5 * 4 = 140 candidate combinations.
parameters = {'n_estimators': [50],
              'max_features': [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8],
              'max_depth': [7, 9, 11, 13, 15],
              'min_samples_leaf': [1, 2, 3, 4]}

# Randomized search: draw n_iter=60 of the 140 combinations instead of
# evaluating all of them.
#
# scoring: rmse is an error (lower is better), so greater_is_better=False makes
# scikit-learn negate it; best_score_ is therefore reported as a negative number.
#
# cv: plain KFold(5) keeps shuffle=False, i.e. contiguous, deterministic folds.
# The seed previously passed to KFold was ignored without shuffling and raises
# ValueError on scikit-learn >= 0.24, so it is omitted; the folds are unchanged.
model = RandomizedSearchCV(RandomForestRegressor(random_state=SEED, n_jobs=4),
                           parameters,
                           n_iter=60,
                           random_state=SEED,
                           scoring=make_scorer(rmse, greater_is_better=False),
                           cv=KFold(5))

In [8]:
%%time
# The %%time cell magic has to remain the first line of the cell; it reports
# the CPU and wall-clock time of everything below it.
# Runs the randomized search on the training data. y is the log-transformed
# target. The fitted search object is the cell's last expression, so its repr
# is displayed as the cell output.
model.fit(X,y)

CPU times: user 2min 21s, sys: 9.14 s, total: 2min 30s
Wall time: 2min 23s


RandomizedSearchCV(cv=KFold(n_splits=5, random_state=1773, shuffle=False),
          error_score='raise',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=4, oob_score=False, random_state=1773,
           verbose=0, warm_start=False),
          fit_params={}, iid=True, n_iter=60, n_jobs=1,
          param_distributions={'n_estimators': [50], 'max_features': [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8], 'max_depth': [7, 9, 11, 13, 15], 'min_samples_leaf': [1, 2, 3, 4]},
          pre_dispatch='2*n_jobs', random_state=1773, refit=True,
          return_train_score=True,
          scoring=make_scorer(rmse, greater_is_better=False), verbose=0)

In [9]:
# Report the winning configuration of the search.
# The search's scorer is built with greater_is_better=False, so best_score_ is
# the negated RMSE (closer to zero is better).
# print(...) with a single pre-formatted string emits the same text on
# Python 2 and Python 3; the bare print statement is a SyntaxError on Python 3.
print('best score: {}'.format(model.best_score_))
print('best parameters: {}'.format(model.best_params_))

best score: -0.139821446558
best parameters: {'n_estimators': 50, 'max_features': 0.4, 'max_depth': 15, 'min_samples_leaf': 1}


In [10]:
# Name under which this run's submission is saved.
submission_name = '05_random_search_shifted_random_forest'

# Predict for the submission rows with the fitted search object.
# NOTE(review): the search was fit on np.log(y), so y_pred is in log space.
# Confirm that save_submission (or the competition metric) expects log-scale
# values; otherwise the predictions need np.exp before being written.
y_pred = model.predict(X_submission)
save_submission(ids_submission, y_pred, submission_name)