In [1]:
import json

import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import xgboost as xgb

from sklearn.metrics import log_loss
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

from scipy.stats import randint as sp_randint

%matplotlib inline
sns.set(style="whitegrid")

import rental_utils; reload(rental_utils)



<module 'rental_utils' from 'rental_utils.pyc'>

In [2]:
train_raw = rental_utils.clean(pd.read_json('Data/train.json'))
test_raw = rental_utils.clean(pd.read_json('Data/test.json'))

[features, feature_names] = rental_utils.get_features(25, train_raw)

train = rental_utils.add_features(train_raw, features, feature_names)
test = rental_utils.add_features(test_raw, features, feature_names)

train = rental_utils.add_region(train)
test = rental_utils.add_region(test)

train = rental_utils.add_variables(train, train)
test = rental_utils.add_variables(test, train)

dv_county = rental_utils.vectorizer('County', train)
train = rental_utils.one_hot_encode(dv_county, train, 'County')
test = rental_utils.one_hot_encode(dv_county, test, 'County')

dv_name = rental_utils.vectorizer('Name', train)
train = rental_utils.one_hot_encode(dv_name, train, 'Name')
test = rental_utils.one_hot_encode(dv_name, test, 'Name')

dv_region = rental_utils.vectorizer('RegionID', train)
train = rental_utils.one_hot_encode(dv_region, train, 'RegionID')
test = rental_utils.one_hot_encode(dv_region, test, 'RegionID')

independent = (['bathrooms', 'bedrooms', 'rooms', 'price', 'price_per_room'] + feature_names + 
    ['description_length', 'n_features', 'n_photos'] +
    ['created_year', 'created_month', 'created_weekday', 'created_hour'] +
    [x for x in train.columns.values if 'County' in x] +
    [x for x in train.columns.values if 'Name' in x] +
    [x for x in train.columns.values if 'Region' in x]
    )
X_train, X_val, y_train, y_val = train_test_split(train[independent], train['interest_level'], test_size=0.33, random_state=1)

In [3]:
clf = xgb.XGBClassifier()

param_dist = {"learning_rate": [0.05, 0.1, 0.2],
              "n_estimators": sp_randint(10,100),
              "max_depth": sp_randint(6,100),
              "colsample_bytree": [0.1, 0.2, 0.33, 0.5, 1]
             }

n_iter_search = 10
random_search = RandomizedSearchCV(clf, verbose=1,
                                   param_distributions=param_dist,
                                   n_iter=n_iter_search)
random_search.fit(X_val, y_val)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] n_estimators=93, learning_rate=0.2, colsample_bytree=0.33, max_depth=21 
[CV]  n_estimators=93, learning_rate=0.2, colsample_bytree=0.33, max_depth=21, total=  10.7s
[CV] n_estimators=93, learning_rate=0.2, colsample_bytree=0.33, max_depth=21 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   11.1s remaining:    0.0s


[CV]  n_estimators=93, learning_rate=0.2, colsample_bytree=0.33, max_depth=21, total=   9.6s
[CV] n_estimators=93, learning_rate=0.2, colsample_bytree=0.33, max_depth=21 
[CV]  n_estimators=93, learning_rate=0.2, colsample_bytree=0.33, max_depth=21, total=  10.0s
[CV] n_estimators=92, learning_rate=0.05, colsample_bytree=0.2, max_depth=56 
[CV]  n_estimators=92, learning_rate=0.05, colsample_bytree=0.2, max_depth=56, total=   8.4s
[CV] n_estimators=92, learning_rate=0.05, colsample_bytree=0.2, max_depth=56 
[CV]  n_estimators=92, learning_rate=0.05, colsample_bytree=0.2, max_depth=56, total=   8.3s
[CV] n_estimators=92, learning_rate=0.05, colsample_bytree=0.2, max_depth=56 
[CV]  n_estimators=92, learning_rate=0.05, colsample_bytree=0.2, max_depth=56, total=   8.2s
[CV] n_estimators=22, learning_rate=0.05, colsample_bytree=0.1, max_depth=86 
[CV]  n_estimators=22, learning_rate=0.05, colsample_bytree=0.1, max_depth=86, total=   1.0s
[CV] n_estimators=22, learning_rate=0.05, colsample_

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  3.8min finished


RandomizedSearchCV(cv=None, error_score='raise',
          estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
          fit_params={}, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7ff57b923f90>, 'learning_rate': [0.05, 0.1, 0.2], 'colsample_bytree': [0.1, 0.2, 0.33, 0.55], 'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7ff57b941290>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring=None, verbose=2)

In [4]:
# Identify optimal hyperparameter values
best_learning_rate     = random_search.best_params_['learning_rate']
best_n_estimators = random_search.best_params_['n_estimators']  
best_max_depth = random_search.best_params_['max_depth']  
best_colsample_bytree = random_search.best_params_['colsample_bytree']  

print("The best performing learning_rate value is:    {}".format(best_learning_rate))
print("The best performing n_estimators value is:     {}".format(best_n_estimators))
print("The best performing max_depth value is:        {}".format(best_max_depth))
print("The best performing colsample_bytree value is: {}".format(best_colsample_bytree))


The best performing learning_rate value is:    0.05
The best performing n_estimators value is:     74
The best performing max_depth value is:        37
The best performing colsample_bytree value is: 0.55


In [6]:
model = random_search.best_estimator_
preds, probs = rental_utils.predict(model, X_val)
print('Log loss:            ' + str(round(log_loss(y_val, probs), 3)))

Log loss:            0.148


In [7]:
pickle.dump(model, open("Models/best_xgb.pkl", "wb"))
model2 = pickle.load(open("Models/best_xgb.pkl", "rb"))