In [18]:
import json

import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

from scipy.stats import randint as sp_randint

%matplotlib inline
sns.set(style="whitegrid")

import rental_utils; reload(rental_utils)

<module 'rental_utils' from 'rental_utils.pyc'>

In [3]:
train_raw = pd.read_json('Data/train.json')
test_raw = pd.read_json('Data/test.json')

[features, feature_names] = rental_utils.get_features(25, train_raw)

train = rental_utils.add_features(train_raw, features, feature_names)
test = rental_utils.add_features(test_raw, features, feature_names)

train = rental_utils.add_region(train)
test = rental_utils.add_region(test)

train = rental_utils.add_variables(train, train)
test = rental_utils.add_variables(test, train)

dv_county = rental_utils.vectorizer('County', train)
train = rental_utils.one_hot_encode(dv_county, train, 'County')
test = rental_utils.one_hot_encode(dv_county, test, 'County')

dv_name = rental_utils.vectorizer('Name', train)
train = rental_utils.one_hot_encode(dv_name, train, 'Name')
test = rental_utils.one_hot_encode(dv_name, test, 'Name')

dv_region = rental_utils.vectorizer('RegionID', train)
train = rental_utils.one_hot_encode(dv_region, train, 'RegionID')
test = rental_utils.one_hot_encode(dv_region, test, 'RegionID')

independent = (['bathrooms', 'bedrooms', 'price'] + feature_names + 
    ['description_length', 'n_features', 'n_photos', 'month'] +
    [x for x in train.columns.values if 'County' in x] +
    [x for x in train.columns.values if 'Name' in x] +
    [x for x in train.columns.values if 'Region' in x]
    )
X_train, X_val, y_train, y_val = train_test_split(train[independent], train['interest_level'], test_size=0.33)

In [27]:
n_estimators = [10, 25, 50]
max_features = [0.1, 0.2, 0.33, 0.5]
max_depth = [10, 20]
min_samples_leaf = [1, 2, 5]

hyperparameters = {
    'n_estimators': n_estimators, 
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_leaf': min_samples_leaf
}

gridCV = GridSearchCV(RandomForestClassifier(), param_grid=hyperparameters, cv=3, n_jobs=4, verbose=1)

gridCV.fit(X_val, y_val)

# Identify optimal hyperparameter values
best_n_estim      = gridCV.best_params_['n_estimators']
best_max_features = gridCV.best_params_['max_features']  
best_max_depth = gridCV.best_params_['max_depth']  
best_min_samples_leaf = gridCV.best_params_['min_samples_leaf']  

print("The best performing n_estimators value is:     {}".format(best_n_estim))
print("The best performing max_features value is:     {}".format(best_max_features))
print("The best performing max_depth value is:        {}".format(best_max_depth))
print("The best performing min_samples_leaf value is: {}".format(best_min_samples_leaf))


Fitting 3 folds for each of 72 candidates, totalling 216 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   38.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  3.3min
[Parallel(n_jobs=4)]: Done 216 out of 216 | elapsed:  4.0min finished


The best performing n_estimators value is:     50
The best performing max_features value is:     0.33
The best performing max_depth value is:        20
The best performing min_samples_leaf value is: 5
