In [57]:
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
from preprocessing_wrapper import (
    load_preprocessed_data
)
from stats import (print_stats)

In [58]:
# Request the preprocessed frame. The flags are forwarded unchanged to
# load_preprocessed_data; their exact effect is defined in
# preprocessing_wrapper, which is not shown in this notebook.
preprocessing_options = dict(
    cleaning=True,
    missing_value=True,
    cat_encoding=True,
    scaling=False,
    OneHotEncoding=False,
    LabelEncoding=True,
)
data = load_preprocessed_data(**preprocessing_options)

In [61]:
# Display the column labels of `data` as a quick schema check.
data.columns

Index(['Host_Response_Time', 'Host_Response_Rate', 'Is_Superhost', 'Latitude',
       'Longitude', 'Is_Exact_Location', 'Accomodates', 'Bathrooms',
       'Bedrooms', 'Beds', 'Guests_Included', 'Min_Nights', 'Reviews',
       'Overall_Rating', 'Accuracy_Rating', 'Cleanliness_Rating',
       'Checkin_Rating', 'Communication_Rating', 'Location_Rating',
       'Value_Rating', 'Instant_Bookable', 'Business_Travel_Ready', 'Price',
       'Relative_Last_Review', 'Relative_First_Review', 'Relative_Host_Since',
       'Label_Encoder_neighbourhood', 'Label_Encoder_Neighborhood_Group',
       'Label_Encoder_Property_Type', 'Label_Encoder_Room_Type'],
      dtype='object')

In [60]:
# Remove the postal-code and id columns so they do not end up in the feature
# matrix built from `data` further down.
# NOTE: this rebinds `data` to the reduced frame, so running the cell a second
# time raises KeyError because the columns are already gone.
data = data.drop(columns=['Postal_Code', 'Listing_ID', 'Host_ID'])

In [62]:
# 'Price' is the regression target; every other remaining column is a feature.
X = data.drop(columns='Price')
y = data['Price']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)



In [66]:
# Sweep the tree depth from 1 to 14 and print the hold-out RMSE for each depth.
#
# random_state is pinned because scikit-learn permutes the candidate features
# at every split; without a seed, ties between equally good splits can resolve
# differently from run to run and the printed RMSEs are not reproducible.
#
# NOTE(review): depths are compared on the test split, so the "best" depth is
# tuned on the same data later used to report performance. Consider a
# validation split or cross-validation before relying on the chosen depth.
for depth in range(1, 15):
    regressor = tree.DecisionTreeRegressor(max_depth=depth, random_state=42)
    regressor = regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)
    RMSE = np.sqrt(mean_squared_error(y_test, y_pred))
    print(depth, RMSE)

1 42.97956033349579
2 39.349192362152344
3 38.78730501352502
4 39.26013552857095
5 39.23223865986306
6 41.701132099984264
7 42.92353381610622
8 41.31830036163291
9 45.97627588304478
10 46.04846992441144
11 46.69260539701138
12 50.92846101078137
13 50.132493760714475
14 52.7320675869878


In [103]:
import xgboost
# Fit one gradient-boosted model with a fixed hyper-parameter set and report
# its RMSE on the held-out split.
xgb_params = dict(
    n_estimators=1000,
    max_depth=7,
    eta=0.1,
    subsample=0.7,
    colsample_bytree=0.8,
)
regressor = xgboost.XGBRegressor(**xgb_params)
regressor = regressor.fit(X_train, y_train)

y_pred = regressor.predict(X_test)
RMSE = np.sqrt(mean_squared_error(y_test, y_pred))
print(RMSE)

36.88953128233412


In [101]:
from sklearn.ensemble import RandomForestRegressor
# Sweep the forest size (200..290 in steps of 10) against the maximum tree
# depth (10..90 in steps of 10) and print "n_estimators max_depth RMSE" for
# each combination, scored on the held-out split.
#
# random_state is pinned: bootstrap sampling and feature sub-sampling are
# random, so without a seed the small RMSE differences between settings cannot
# be told apart from run-to-run noise.
# n_jobs=-1 trains the trees of each forest in parallel; it does not change
# the fitted model, only the wall-clock time of the 90 fits.
#
# NOTE(review): settings are compared on the test split; consider
# cross-validation before picking a winner from this table.
for n_trees in range(200, 300, 10):
    for depth in range(10, 100, 10):
        regressor = RandomForestRegressor(
            bootstrap=True,
            max_depth=depth,
            max_features=2,
            min_samples_leaf=4,
            min_samples_split=8,
            n_estimators=n_trees,
            random_state=42,
            n_jobs=-1,
        )
        regressor = regressor.fit(X_train, y_train)
        y_pred = regressor.predict(X_test)
        RMSE = np.sqrt(mean_squared_error(y_test, y_pred))
        print(n_trees, depth, RMSE)

200 10 37.63776595772406
200 20 36.91972506364946
200 30 37.043634302232135
200 40 36.952257048786116
200 50 36.95927393289142
200 60 36.8814870895223
200 70 36.99408120359837
200 80 36.88611882782178
200 90 36.89267062560892
210 10 37.38311626204062
210 20 36.93456254538285
210 30 36.90985411921487
210 40 36.78740172328033
210 50 36.93589880378338
210 60 36.90651120678123
210 70 36.959710656411566
210 80 36.89543091841756
210 90 36.90244554929735
220 10 37.57165840248491
220 20 36.848457110738984
220 30 36.85847822587967
220 40 36.95943795785076
220 50 36.922066494909366
220 60 36.87947243654253
220 70 36.85759774043146
220 80 36.93159335129698
220 90 36.91743341237417
230 10 37.50647128144712
230 20 36.86567391288748
230 30 36.97547252706123
230 40 36.764465412961506
230 50 36.91147390737041
230 60 36.88581188346585
230 70 37.00983059237351
230 80 36.83766991168047
230 90 36.81603457989419
240 10 37.61348282686143
240 20 36.91454806671814


KeyboardInterrupt: 

In [89]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search
param_grid = {
    'bootstrap': [True],
    'max_depth': [20, 100, 300, 400],
    'max_features': [1, 2],
    'min_samples_leaf': [4],
    'min_samples_split': [8],
    'n_estimators': [100, 200, 300, 400],
}

# Create a base model. It is seeded so that every candidate is trained with
# the same randomness and repeated searches return the same best_params_.
rf = RandomForestRegressor(random_state=42)

# Instantiate the grid search model.
# scoring is set explicitly: without it GridSearchCV ranks candidates by the
# estimator's default score (R^2 for regressors), whereas the rest of this
# notebook compares models by RMSE. Negated MSE gives the same ranking as RMSE.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 32 candidates, totalling 96 fits


KeyboardInterrupt: 

In [88]:
# Display the best parameter combination found by the fitted grid search.
# NOTE(review): this only reflects the most recent *completed* fit of
# `grid_search`; re-run the search cell first if it was interrupted.
grid_search.best_params_


{'bootstrap': True,
 'max_depth': 300,
 'max_features': 2,
 'min_samples_leaf': 4,
 'min_samples_split': 8,
 'n_estimators': 200}

In [68]:
# Number of feature columns in X (i.e. `data` without 'Price').
len(X.columns)

29

In [69]:
from sklearn.tree import export_graphviz

# Export a single decision tree to Graphviz DOT format (written to tree.dot).
#
# A dedicated tree is fitted here instead of reusing `regressor`: that name is
# reassigned by the boosting and random-forest cells, and export_graphviz can
# only export a single fitted decision tree, so a top-to-bottom run would fail.
# max_depth=3 is the depth with the lowest hold-out RMSE in the recorded depth
# sweep, and it keeps the exported graph small enough to read.
tree_to_export = tree.DecisionTreeRegressor(max_depth=3, random_state=42)
tree_to_export = tree_to_export.fit(X_train, y_train)

# Take the feature names from the training frame rather than a hand-copied
# list, so they cannot drift out of sync with the columns the tree was fit on.
export_graphviz(
    tree_to_export,
    out_file='tree.dot',
    feature_names=list(X.columns),
)

In [70]:
from sklearn.tree import export_graphviz
from IPython.display import Image 
import pydotplus

# Function to create a tree diagram
def create_tree_graph_png(tree, feature_names, out_path='tree.png'):
    """Render a fitted decision tree as a PNG, save it and return it for display.

    Parameters
    ----------
    tree : fitted decision tree estimator
        Passed straight to ``export_graphviz``.
    feature_names : list of str
        Names used to label the split features in the diagram.
    out_path : str, default 'tree.png'
        File the PNG is written to.

    Returns
    -------
    IPython.display.Image
        The rendered diagram, shown inline when it is the cell's last value.

    Raises
    ------
    pydotplus.InvocationException
        If the GraphViz executables cannot be found or fail to run.
    """
    dot_source = export_graphviz(tree, feature_names=feature_names, filled=True, out_file=None)
    graph = pydotplus.graph_from_dot_data(dot_source)
    # Render once and reuse the bytes for both the file and the inline image,
    # instead of invoking GraphViz separately for each.
    png_bytes = graph.create_png()
    with open(out_path, 'wb') as png_file:
        png_file.write(png_bytes)
    return Image(png_bytes)

# Display the tree
# Fit a dedicated single tree for plotting. `regressor` is reassigned by the
# boosting and random-forest cells, and those ensemble models cannot be
# exported as one tree, so reusing it breaks a top-to-bottom run.
# max_depth=3 is the depth with the lowest hold-out RMSE in the recorded depth
# sweep, and it keeps the diagram legible.
tree_to_plot = tree.DecisionTreeRegressor(max_depth=3, random_state=42)
tree_to_plot = tree_to_plot.fit(X_train, y_train)

# Feature names come from the training frame so they always match the columns
# the tree was fitted on.
create_tree_graph_png(tree_to_plot, feature_names=list(X.columns))

InvocationException: GraphViz's executables not found

In [23]:
# Print summary statistics for `data` using the project's print_stats helper
# (imported from the local `stats` module).
# NOTE(review): the captured output still lists Listing_ID and Host_ID, which
# an earlier cell drops from `data` — the output looks stale; re-run to confirm.
print_stats(data)

Unnamed: 0,min,max,mean,std,median,nunique,count_na
Listing_ID,22415.0,34674496.0,17797450.0,9953563.0,18646640.0,15659,0
Host_ID,11015.0,260969848.0,61149860.0,67375190.0,33554750.0,15662,0
Host_Response_Time,0,3,1.213366,1.300912,1.0,4,0
Host_Response_Rate,0.0,100.0,92.22836,14.04489,92.22836,34,0
Is_Superhost,True,True,1.0,0.0,1.0,1,0
Latitude,52.36927,52.63967,52.5099,0.03129251,52.50912,9522,0
Longitude,13.1214,13.70902,13.40737,0.05864067,13.41727,10581,0
Is_Exact_Location,True,True,1.0,0.0,1.0,1,0
Accomodates,1.0,16.0,2.62288,1.401251,2.0,12,0
Bathrooms,0.0,3.0,1.08768,0.2774891,1.0,7,0
