In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_validate, GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestRegressor

In [2]:
listing_data = pd.read_csv("modelling_data")
listing_data.head()

Unnamed: 0,name,borough,neighbourhood,room_type_encoded,price,bathrooms,accommodates,bedrooms,price_range_encoded,minimum_nights,...,neighbourhood_Willowbrook,neighbourhood_Windsor Terrace,neighbourhood_Woodhaven,neighbourhood_Woodlawn,neighbourhood_Woodrow,neighbourhood_Woodside,borough_Brooklyn,borough_Manhattan,borough_Queens,borough_Staten Island
0,"Spacious Brooklyn Duplex, Patio + Garden",Brooklyn,Sunset Park,0,275,1.0,4,2.0,2,21,...,0,0,0,0,0,0,1,0,0,0
1,Cozy Clean Guest Room - Family Apt,Manhattan,Upper West Side,1,75,1.0,1,1.0,0,2,...,0,0,0,0,0,0,0,1,0,0
2,BlissArtsSpace!,Brooklyn,Bedford-Stuyvesant,1,60,1.0,2,1.0,1,30,...,0,0,0,0,0,0,1,0,0,0
3,Large Furnished Room Near B'way,Manhattan,Midtown,1,68,1.0,2,1.0,1,2,...,0,0,0,0,0,0,0,1,0,0
4,Uptown Sanctuary w/ Private Bath (Month to Month),Manhattan,East Harlem,1,65,1.0,1,1.0,1,30,...,0,0,0,0,0,0,0,1,0,0


In [3]:
X_train, X_test, y_train, y_test = train_test_split(listing_data.drop(columns='price'),
                                                    listing_data.price, test_size=0.3,
                                                    random_state=18)


names_list = ['name', 'borough','neighbourhood']
names_train = X_train[names_list]
names_test = X_test[names_list]
X_train.drop(columns=names_list, inplace=True)
X_test.drop(columns=names_list, inplace=True)
X_train.shape, X_test.shape

((26221, 232), (11238, 232))

In the previous step, we did a basic random forest regression which had RSME of $62.69 which isn't great consider the average listing price was $178. 

Now I will to try find a better performing choice of parameters using GridSearchCV.

One thing to consider is the time each model will take to run as this is a larger dataset and the initial model took 15s to run.

In [5]:

## Define Grid 
grid = { 
    'n_estimators': [100,500,1000],
    'max_depth' : [10,20,30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'random_state': [18]
    }

CV_rfr = GridSearchCV(estimator=RandomForestRegressor(), param_grid=grid, cv= 5, n_jobs=-1, verbose=1)
CV_rfr.fit(X_train, y_train)


print("Best parameters: ", CV_rfr.best_params_)
print("Best score: ", CV_rfr.best_score_)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best parameters:  {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 500, 'random_state': 18}
Best score:  0.8661154938439871


Fitting 5 folds for each of 81 candidates, totalling 405 fits. Time of completion: 107m 17.6s

Best parameters:  {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 500, 'random_state': 18}

Best score:  0.8661154938439871