In [1]:
import json

import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import log_loss
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

from scipy.stats import randint as sp_randint

%matplotlib inline
sns.set(style="whitegrid")

import rental_utils; reload(rental_utils)

<module 'rental_utils' from 'rental_utils.pyc'>

In [2]:
train_raw = rental_utils.clean(pd.read_json('Data/train.json'))
test_raw = rental_utils.clean(pd.read_json('Data/test.json'))

[features, feature_names] = rental_utils.get_features(25, train_raw)

train = rental_utils.add_features(train_raw, features, feature_names)
test = rental_utils.add_features(test_raw, features, feature_names)

train = rental_utils.add_region(train)
test = rental_utils.add_region(test)

train = rental_utils.add_variables(train, train)
test = rental_utils.add_variables(test, train)

dv_county = rental_utils.vectorizer('County', train)
train = rental_utils.one_hot_encode(dv_county, train, 'County')
test = rental_utils.one_hot_encode(dv_county, test, 'County')

dv_name = rental_utils.vectorizer('Name', train)
train = rental_utils.one_hot_encode(dv_name, train, 'Name')
test = rental_utils.one_hot_encode(dv_name, test, 'Name')

dv_region = rental_utils.vectorizer('RegionID', train)
train = rental_utils.one_hot_encode(dv_region, train, 'RegionID')
test = rental_utils.one_hot_encode(dv_region, test, 'RegionID')

independent = (['bathrooms', 'bedrooms', 'rooms', 'price', 'price_per_room'] + feature_names + 
    ['description_length', 'n_features', 'n_photos'] +
    ['created_year', 'created_month', 'created_weekday', 'created_hour'] +
    [x for x in train.columns.values if 'County' in x] +
    [x for x in train.columns.values if 'Name' in x] +
    [x for x in train.columns.values if 'Region' in x]
    )
X_train, X_val, y_train, y_val = train_test_split(train[independent], train['interest_level'], test_size=0.33)

In [3]:
clf = RandomForestClassifier()

param_dist = {"n_estimators": sp_randint(10,100),
              "max_depth": sp_randint(10,100),
              "max_features": [0.1, 0.2, 0.33, 0.5],
              "min_samples_split": sp_randint(2, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

n_iter_search = 20
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search)
random_search.fit(X_val, y_val)

RandomizedSearchCV(cv=None, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
          fit_params={}, iid=True, n_iter=20, n_jobs=1,
          param_distributions={'bootstrap': [True, False], 'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000000000CA4AB38>, 'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000000000CA4A908>, 'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000000000CA4AC50>, 'criterion': ['gini', 'entropy'], 'max_features': [0.1, 0.2, 0.33, 0.5], 'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000

In [4]:
# Identify optimal hyperparameter values
best_n_estim      = random_search.best_params_['n_estimators']
best_max_features = random_search.best_params_['max_features']  
best_max_depth = random_search.best_params_['max_depth']  
best_min_samples_leaf = random_search.best_params_['min_samples_leaf']  

print("The best performing n_estimators value is:     {}".format(best_n_estim))
print("The best performing max_features value is:     {}".format(best_max_features))
print("The best performing max_depth value is:        {}".format(best_max_depth))
print("The best performing min_samples_leaf value is: {}".format(best_min_samples_leaf))


The best performing n_estimators value is:     83
The best performing max_features value is:     0.33
The best performing max_depth value is:        24
The best performing min_samples_leaf value is: 10
