In [1]:
import json

import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import xgboost as xgb

from sklearn.metrics import log_loss
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

from scipy.stats import randint as sp_randint

%matplotlib inline
sns.set(style="whitegrid")

# Re-import the project helper module so that edits made to it on disk are
# picked up without restarting the kernel.
import rental_utils
try:
    reload  # builtin on Python 2 only
except NameError:
    # Python 3 removed the builtin; importlib provides the same function.
    from importlib import reload
reload(rental_utils)

<module 'rental_utils' from 'rental_utils.pyc'>

In [None]:
# --- Load and clean the raw listings -------------------------------------
train_raw = rental_utils.clean(pd.read_json('Data/train.json'))
test_raw = rental_utils.clean(pd.read_json('Data/test.json'))

# Feature definitions are derived from the training data only and then applied
# to both frames.
# NOTE(review): 25 is presumably the number of listing features to keep --
# confirm against rental_utils.get_features.
features, feature_names = rental_utils.get_features(25, train_raw)

train = rental_utils.add_features(train_raw, features, feature_names)
test = rental_utils.add_features(test_raw, features, feature_names)

train = rental_utils.add_region(train)
test = rental_utils.add_region(test)

# Both calls receive a training frame as the second argument; the test call
# sees the already-augmented `train`, so the order of these two lines matters.
train = rental_utils.add_variables(train, train)
test = rental_utils.add_variables(test, train)

# --- One-hot encode the categorical columns ------------------------------
# Each vectorizer is built from `train` and then applied to both frames.
encoders = {}
for column in ('County', 'Name', 'RegionID'):
    encoder = rental_utils.vectorizer(column, train)
    train = rental_utils.one_hot_encode(encoder, train, column)
    test = rental_utils.one_hot_encode(encoder, test, column)
    encoders[column] = encoder

# Keep the individual encoder names available to the rest of the notebook.
dv_county = encoders['County']
dv_name = encoders['Name']
dv_region = encoders['RegionID']

# --- Assemble the model inputs -------------------------------------------
numeric_columns = ['bathrooms', 'bedrooms', 'rooms', 'price', 'price_per_room']
listing_columns = ['description_length', 'n_features', 'n_photos']
created_columns = ['created_year', 'created_month', 'created_weekday', 'created_hour']

# Encoded columns are selected by substring match on the column name.
all_columns = train.columns.values
county_columns = [col for col in all_columns if 'County' in col]
name_columns = [col for col in all_columns if 'Name' in col]
region_columns = [col for col in all_columns if 'Region' in col]

independent = (
    numeric_columns + feature_names + listing_columns + created_columns
    + county_columns + name_columns + region_columns
)

# Hold out 33% of the training rows for validation; the seed is fixed so the
# split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    train[independent],
    train['interest_level'],
    test_size=0.33,
    random_state=1,
)

In [None]:
# Randomised hyperparameter search for an XGBoost classifier.
clf = xgb.XGBClassifier()

# Lists are sampled uniformly; scipy's randint excludes its upper bound, so
# n_estimators is drawn from 10..999 and max_depth from 6..9.
param_dist = {"learning_rate": [0.05, 0.1, 0.2],
              "n_estimators": sp_randint(10,1000),
              "max_depth": sp_randint(6,10),
              "colsample_bytree": [0.33, 0.5, 0.75, 1]
             }

n_iter_search = 20
# scoring: rank candidates by log loss, the metric this notebook reports on
#          the validation split, instead of the classifier's default accuracy.
# random_state: fix the sampled candidates so a re-run reproduces the search.
random_search = RandomizedSearchCV(clf, verbose=1,
                                   param_distributions=param_dist,
                                   n_iter=n_iter_search,
                                   scoring='neg_log_loss',
                                   random_state=1)
random_search.fit(X_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


In [None]:
# Pull the winning hyperparameter values out of the finished search.
best_params = random_search.best_params_
best_learning_rate = best_params['learning_rate']
best_n_estimators = best_params['n_estimators']
best_max_depth = best_params['max_depth']
best_colsample_bytree = best_params['colsample_bytree']

# Report each selected value on its own aligned line.
print("The best performing learning_rate value is:    {}".format(best_learning_rate))
print("The best performing n_estimators value is:     {}".format(best_n_estimators))
print("The best performing max_depth value is:        {}".format(best_max_depth))
print("The best performing colsample_bytree value is: {}".format(best_colsample_bytree))


In [None]:
# Score the best estimator from the search on the held-out validation split.
model = random_search.best_estimator_

# NOTE(review): rental_utils.predict presumably returns class predictions and
# per-class probabilities -- confirm against the helper module.
preds, probs = rental_utils.predict(model, X_val)

# Log loss is rounded to three decimals for display.
val_log_loss = round(log_loss(y_val, probs), 3)
print('Log loss:            ' + str(val_log_loss))

In [None]:
# Persist the tuned model, then load it back to confirm the file round-trips.
# `with` closes each handle deterministically, so the pickle is fully flushed
# to disk before it is read again.
with open("Models/best_xgb.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

# NOTE: pickle.load can execute arbitrary code contained in the file; only
# load pickles that were produced by a trusted source.
with open("Models/best_xgb.pkl", "rb") as model_file:
    model2 = pickle.load(model_file)