# Tree Models
### Author: Jainam Mehta

In [84]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Load data and prepare it for the RandomForest Model

In [85]:
# Load the data
df = pd.read_csv("Data/data.csv")
df.head()

Unnamed: 0,id,hos2_is_superhos2,host_listings_count,neighbourhood_cleansed,neighbourhood_group_cleansed,city,zipcode,property_type,room_type,accommodates,...,maximum_nights_avg_ntm,number_of_reviews,number_of_reviews_ltm,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value
0,2595,1.0,6.0,Midtown,Manhattan,New York,10018,Apartment,Entire home/apt,1,...,1125.0,48,7.0,94.0,9.0,9.0,10.0,10.0,10.0,9.0
1,3831,1.0,1.0,Clinton Hill,Brooklyn,Brooklyn,11238,Guest suite,Entire home/apt,3,...,730.0,295,75.0,90.0,9.0,9.0,10.0,9.0,10.0,9.0
2,5099,1.0,1.0,Murray Hill,Manhattan,New York,10016,Apartment,Entire home/apt,2,...,21.0,78,8.0,90.0,10.0,9.0,10.0,10.0,10.0,9.0
3,5178,1.0,1.0,Hell's Kitchen,Manhattan,New York,10019,Apartment,Private room,2,...,14.0,454,47.0,84.0,9.0,7.0,9.0,9.0,10.0,8.0
4,5238,2.0,4.0,Chinatown,Manhattan,New York,10002,Apartment,Entire home/apt,3,...,1125.0,161,9.0,94.0,10.0,9.0,10.0,10.0,9.0,9.0


In [86]:
# Descriptive statistics for each column
df.describe()

Unnamed: 0,id,hos2_is_superhos2,host_listings_count,accommodates,bathrooms,bedrooms,beds,price,guests_included,minimum_nights,...,maximum_nights_avg_ntm,number_of_reviews,number_of_reviews_ltm,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value
count,28268.0,28268.0,28268.0,28268.0,28268.0,28268.0,28268.0,28268.0,28268.0,28268.0,...,28268.0,28268.0,28268.0,28268.0,28268.0,28268.0,28268.0,28268.0,28268.0,28268.0
mean,22984580.0,1.312509,8.427268,2.953976,1.134339,1.186819,1.600255,129.525647,1.641326,5.757677,...,10386.68,39.998054,16.066648,94.03251,9.636939,9.328216,9.759764,9.751097,9.609276,9.395323
std,12152850.0,0.463524,56.331599,1.870513,0.391389,0.722178,1.146669,85.643423,1.212901,15.842635,...,673380.7,56.912793,18.678909,8.043051,0.793117,0.974955,0.68179,0.722382,0.695249,0.877691
min,2595.0,1.0,0.0,1.0,0.0,0.0,0.0,10.0,1.0,1.0,...,1.0,1.0,1.0,20.0,2.0,2.0,2.0,2.0,2.0,2.0
25%,13642440.0,1.0,1.0,2.0,1.0,1.0,1.0,68.0,1.0,1.0,...,29.0,5.0,3.0,92.0,9.0,9.0,10.0,10.0,9.0,9.0
50%,25026260.0,1.0,1.0,2.0,1.0,1.0,1.0,100.0,1.0,2.0,...,365.0,17.0,9.0,96.0,10.0,10.0,10.0,10.0,10.0,10.0
75%,33611990.0,2.0,3.0,4.0,1.0,1.0,2.0,168.0,2.0,4.0,...,1125.0,51.0,24.0,99.0,10.0,10.0,10.0,10.0,10.0,10.0
max,40513120.0,2.0,1767.0,16.0,7.0,21.0,26.0,500.0,16.0,1125.0,...,49481220.0,675.0,407.0,100.0,10.0,10.0,10.0,10.0,10.0,10.0


In [87]:
# drop unecessary columns
df = df.drop(['neighbourhood_cleansed', 'city', 'security_deposit', 'cleaning_fee', 'extra_people'], axis=1)
df.columns
#df = df.drop(['neighbourhood_cleansed', 'city', 'zipcode', 'security_deposit', 'cleaning_fee', 'extra_people'], axis=1)
#df.columns

Index(['id', 'hos2_is_superhos2', 'host_listings_count',
       'neighbourhood_group_cleansed', 'zipcode', 'property_type', 'room_type',
       'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type',
       'amenities', 'price', 'guests_included', 'minimum_nights',
       'maximum_nights', 'minimum_minimum_nights', 'maximum_minimum_nights',
       'minimum_maximum_nights', 'maximum_maximum_nights',
       'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'number_of_reviews',
       'number_of_reviews_ltm', 'review_scores_rating',
       'review_scores_accuracy', 'review_scores_cleanliness',
       'review_scores_checkin', 'review_scores_communication',
       'review_scores_location', 'review_scores_value'],
      dtype='object')

In [88]:
# replace non-numeric zipcodes with -999999
df['zipcode'] = (pd.to_numeric(df['zipcode'], errors='coerce').fillna(-999999))
df.count()

id                              28268
hos2_is_superhos2               28268
host_listings_count             28268
neighbourhood_group_cleansed    28268
zipcode                         28268
property_type                   28268
room_type                       28268
accommodates                    28268
bathrooms                       28268
bedrooms                        28268
beds                            28268
bed_type                        28268
amenities                       28268
price                           28268
guests_included                 28268
minimum_nights                  28268
maximum_nights                  28268
minimum_minimum_nights          28268
maximum_minimum_nights          28268
minimum_maximum_nights          28268
maximum_maximum_nights          28268
minimum_nights_avg_ntm          28268
maximum_nights_avg_ntm          28268
number_of_reviews               28268
number_of_reviews_ltm           28268
review_scores_rating            28268
review_score

In [89]:
# remove non-numeric zipcodes
#df = df[df['zipcode'].str.isnumeric()].reset_index(drop=True)
#df['zipcode'] = df['zipcode'].astype(int)
#df.count()

In [90]:
# try a smaller subset of rows, with only those with more than 5 reviews in the last 12 months
#df = df[df['number_of_reviews_ltm'] >= 5].reset_index(drop=True)
#df['number_of_reviews_ltm'].count()

**Handling the amenities**

In [91]:
# split up amenities
amenities_df = df[['id', 'amenities', 'price', 'number_of_reviews']].copy()
amenities_df['amenities_list'] = [x[1:-1].replace("'",'').replace('"', '').replace(' ', '').split(',') 
                                   for x in amenities_df['amenities']]

amenities_df.head()

Unnamed: 0,id,amenities,price,number_of_reviews,amenities_list
0,2595,"{TV,Wifi,""Air conditioning"",Kitchen,""Paid park...",225.0,48,"[TV, Wifi, Airconditioning, Kitchen, Paidparki..."
1,3831,"{TV,""Cable TV"",Internet,Wifi,""Air conditioning...",89.0,295,"[TV, CableTV, Internet, Wifi, Airconditioning,..."
2,5099,"{TV,""Cable TV"",Internet,Wifi,""Air conditioning...",200.0,78,"[TV, CableTV, Internet, Wifi, Airconditioning,..."
3,5178,"{TV,Wifi,""Air conditioning"",""Paid parking off ...",79.0,454,"[TV, Wifi, Airconditioning, Paidparkingoffprem..."
4,5238,"{TV,""Cable TV"",Wifi,""Air conditioning"",Kitchen...",150.0,161,"[TV, CableTV, Wifi, Airconditioning, Kitchen, ..."


In [92]:
# Find all distinct amenities & calculate amenity coverages
all_amenities = []
for n in amenities_df['amenities_list']:
    all_amenities += n
all_amenities = np.unique(np.array(all_amenities),return_counts = 1)
all_amenities = pd.Series(all_amenities[1]/df.shape[0], index = all_amenities[0])
all_amenities = all_amenities.sort_values(ascending = False)

In [93]:
# Total 130 amenities
a_list = all_amenities.index # This list will be used to create columns
len(a_list)

130

In [94]:
range(amenities_df.shape[0])
amenities_df['id'][4406:4409]

4406    6969011
4407    6973286
4408    6973292
Name: id, dtype: int64

In [95]:
N = amenities_df.shape[0]
amenities_bin = [0]*N
for i in range(N):
    candidate = amenities_df['amenities_list'][i]
    amenities_bin[i] = [n in candidate for n in a_list]
    if i%10000 == 0:
        print("{} rows completed".format(i))

0 rows completed
10000 rows completed
20000 rows completed


In [96]:
# Combine binary variables with the amenity dataframe
col_names = 'amenities_' + all_amenities.index # This list will be used to create columns
amenities_df = pd.concat([amenities_df, pd.DataFrame(amenities_bin, columns = col_names).astype('int')], axis = 1)

amenities_df.head()

Unnamed: 0,id,amenities,price,number_of_reviews,amenities_list,amenities_Wifi,amenities_Essentials,amenities_Heating,amenities_Smokedetector,amenities_Kitchen,...,amenities_Ski-in/Ski-out,amenities_,amenities_Privatebathroom,amenities_Groundflooraccess,amenities_Washer/Dryer,amenities_Mobilehoist,amenities_Poolwithpoolhoist,amenities_Ceilinghoist,amenities_Airpurifier,amenities_Kitchenette
0,2595,"{TV,Wifi,""Air conditioning"",Kitchen,""Paid park...",225.0,48,"[TV, Wifi, Airconditioning, Kitchen, Paidparki...",1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,3831,"{TV,""Cable TV"",Internet,Wifi,""Air conditioning...",89.0,295,"[TV, CableTV, Internet, Wifi, Airconditioning,...",1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,5099,"{TV,""Cable TV"",Internet,Wifi,""Air conditioning...",200.0,78,"[TV, CableTV, Internet, Wifi, Airconditioning,...",1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
3,5178,"{TV,Wifi,""Air conditioning"",""Paid parking off ...",79.0,454,"[TV, Wifi, Airconditioning, Paidparkingoffprem...",1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,5238,"{TV,""Cable TV"",Wifi,""Air conditioning"",Kitchen...",150.0,161,"[TV, CableTV, Wifi, Airconditioning, Kitchen, ...",1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [97]:
amenities_df.shape

(28268, 135)

In [98]:
# Export as csv file
amenities_df.to_csv('Data/amenities_df.csv') 

In [99]:
# Drop amenities column from main df and add dummified amenities
df = df.drop('amenities', axis=1)
df = pd.concat([df, pd.DataFrame(amenities_bin, columns = col_names).astype('int')], axis = 1)
df.head()

Unnamed: 0,id,hos2_is_superhos2,host_listings_count,neighbourhood_group_cleansed,zipcode,property_type,room_type,accommodates,bathrooms,bedrooms,...,amenities_Ski-in/Ski-out,amenities_,amenities_Privatebathroom,amenities_Groundflooraccess,amenities_Washer/Dryer,amenities_Mobilehoist,amenities_Poolwithpoolhoist,amenities_Ceilinghoist,amenities_Airpurifier,amenities_Kitchenette
0,2595,1.0,6.0,Manhattan,10018.0,Apartment,Entire home/apt,1,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,3831,1.0,1.0,Brooklyn,11238.0,Guest suite,Entire home/apt,3,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
2,5099,1.0,1.0,Manhattan,10016.0,Apartment,Entire home/apt,2,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
3,5178,1.0,1.0,Manhattan,10019.0,Apartment,Private room,2,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
4,5238,2.0,4.0,Manhattan,10002.0,Apartment,Entire home/apt,3,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0


### Create train/test split

In [100]:
# Split data into dependent and predictor variables
y = df['price']
print(y.head())

X = df.drop('price', axis=1)
X.head()

0    225.0
1     89.0
2    200.0
3     79.0
4    150.0
Name: price, dtype: float64


Unnamed: 0,id,hos2_is_superhos2,host_listings_count,neighbourhood_group_cleansed,zipcode,property_type,room_type,accommodates,bathrooms,bedrooms,...,amenities_Ski-in/Ski-out,amenities_,amenities_Privatebathroom,amenities_Groundflooraccess,amenities_Washer/Dryer,amenities_Mobilehoist,amenities_Poolwithpoolhoist,amenities_Ceilinghoist,amenities_Airpurifier,amenities_Kitchenette
0,2595,1.0,6.0,Manhattan,10018.0,Apartment,Entire home/apt,1,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,3831,1.0,1.0,Brooklyn,11238.0,Guest suite,Entire home/apt,3,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
2,5099,1.0,1.0,Manhattan,10016.0,Apartment,Entire home/apt,2,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
3,5178,1.0,1.0,Manhattan,10019.0,Apartment,Private room,2,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
4,5238,2.0,4.0,Manhattan,10002.0,Apartment,Entire home/apt,3,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0


In [101]:
# one-hot encode the data 
X = pd.get_dummies(X)
predictor_list = X.columns.tolist()

# check the columns have been dummified
X.iloc[:, 155:165].head(10)

Unnamed: 0,amenities_Kitchenette,neighbourhood_group_cleansed_Bronx,neighbourhood_group_cleansed_Brooklyn,neighbourhood_group_cleansed_Manhattan,neighbourhood_group_cleansed_Queens,neighbourhood_group_cleansed_Staten Island,property_type_Aparthotel,property_type_Apartment,property_type_Barn,property_type_Bed and breakfast
0,0,0,0,1,0,0,0,1,0,0
1,0,0,1,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,1,0,0
3,0,0,0,1,0,0,0,1,0,0
4,0,0,0,1,0,0,0,1,0,0
5,0,0,0,1,0,0,0,1,0,0
6,0,0,1,0,0,0,0,0,0,0
7,0,0,1,0,0,0,0,1,0,0
8,0,0,0,1,0,0,0,1,0,0
9,0,0,1,0,0,0,0,0,0,0


In [102]:
# check dimensions
print(y.shape)
X.shape

(28268,)


(28268, 199)

In [103]:
# covert to numpy arrays
#y = np.array(y)
#X = np.array(X)

In [104]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, train_size = 0.7, random_state = 123)

In [105]:
X_train.head()

Unnamed: 0,id,hos2_is_superhos2,host_listings_count,zipcode,accommodates,bathrooms,bedrooms,beds,guests_included,minimum_nights,...,property_type_Villa,room_type_Entire home/apt,room_type_Hotel room,room_type_Private room,room_type_Shared room,bed_type_Airbed,bed_type_Couch,bed_type_Futon,bed_type_Pull-out Sofa,bed_type_Real Bed
22409,34790450,1.0,1.0,10014.0,6,1.0,3.0,4.0,1,3,...,0,1,0,0,0,0,0,0,0,1
13750,24380794,1.0,2.0,11211.0,4,1.0,2.0,2.0,1,2,...,0,1,0,0,0,0,0,0,0,1
7816,15095439,1.0,1.0,10039.0,3,1.0,1.0,1.0,1,2,...,0,1,0,0,0,0,0,0,0,1
19952,32195520,1.0,1.0,11101.0,2,1.0,1.0,1.0,1,7,...,0,1,0,0,0,0,0,0,0,1
15072,26573069,1.0,2.0,11221.0,3,1.0,1.0,1.0,1,2,...,0,0,0,1,0,0,0,0,0,1


In [106]:
# check dimensions of resulting split
print('Training Features Shape:', X_train.shape)
print('Training Labels Shape:', y_train.shape)
print('Testing Features Shape:', X_test.shape)
print('Testing Labels Shape:', y_test.shape)

Training Features Shape: (19787, 199)
Training Labels Shape: (19787,)
Testing Features Shape: (8481, 199)
Testing Labels Shape: (8481,)


### Training the model

In [None]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor

# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 123)

# Train the model on training data
rf.fit(X_train, y_train);

print('Training R2:', round(rf.score(X_train, y_train), 3))

### Generating predictions on the test set

In [None]:
# Use the forest's predict method on the test data
predictions = rf.predict(X_test)

### Performance metrics

In [None]:
# Calculate the absolute errors
errors = abs(predictions - y_test)

# Print out the mean absolute error (mae)
print('Avg error: $', round(np.mean(errors), 2))

In [None]:
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / y_test)

# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

# Calculate R2 value
r2 = rf.score(X_test, y_test)
print('R2:', round(r2, 3))

In [None]:
# Save predictions
predsRF = pd.DataFrame({'id': np.array(X_test['id']), 'actual': np.array(y_test), 'pred': np.array(predictions)}, columns=['id', 'actual', 'pred'])
predsRF.head()
predsRF.to_csv('predsRF.csv', index = False) 

### Checking for predictor importance

In [None]:
# Get numerical feature importances
importances = list(rf.feature_importances_)

# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(predictor_list, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

feature_importances = pd.DataFrame(feature_importances, columns=['feature', 'importance'])

In [None]:
# Plot out importances

# Import matplotlib for plotting 
import matplotlib.pyplot as plt
%matplotlib inline

# Set the style
plt.style.use('fivethirtyeight')
plt.figure(figsize=(6,10))

top_n = 10

# Make a bar chart
plt.barh(range(top_n), feature_importances['importance'][:top_n]) #  orientation = 'horizontal'

# Tick labels for x axis
plt.yticks(range(top_n), feature_importances['feature'][:top_n]) # , rotation= 'vertical'

# Axis labels and title
plt.ylabel('Variable'); plt.xlabel('Importance'); plt.title('Variable Importances');

### Re-training the model with important predictors

In [None]:
# select only the most important predictors and recreate the model
important_cols = feature_importances[feature_importances['importance'] > 0]['feature'].tolist()

# Subset predictors with only the most important features
X_train_imp = X_train[important_cols]
X_test_imp = X_test[important_cols]

# Interesting to see that the id column actually had an important of 0.02
# However, we drop the id column, as that should have nothing to do with price.
# X_train_imp = X_train_imp.drop('id', axis=1)
# X_test_imp = X_test_imp.drop('id', axis=1)

In [124]:
# New random forest with only the most important variables
rf_imp = RandomForestRegressor(n_estimators = 1000, random_state = 123)

# Train the random forest
rf_imp.fit(X_train_imp, y_train)

print('Training R2:', round(rf_imp.score(X_train_imp, y_train), 3))

Training R2: 0.956


In [127]:
# Make predictions and determine the error
predictions = rf_imp.predict(X_test_imp) 

In [128]:
# Calculate the absolute errors
errors = abs(predictions - y_test)

# Display the performance metrics
print('Avg error: $', round(np.mean(errors), 2))

mape = 100 * (errors / y_test)
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

r2 = rf_imp.score(X_test_imp, y_test)
print('R2:', round(r2, 3))

Avg error: $ 32.68
Accuracy: 71.2 %.
R2: 0.654


**Summary of steps taken to improve accuracy:**
1. Select only the most important variables

2. Add zipcode data back into the model  
Adds about 2% in accuracy

3. Further subset the data with number_of_reviews_ltm > 5  
Adds about 1.5% in accuracy:  
Avg error: $ 30.22  
Accuracy: 72.87 %.  
R2: 0.691  

4. Hyperparameter tuning

### Hyperparameter tuning

In [129]:
rf = RandomForestRegressor(random_state = 123)
from pprint import pprint

# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(rf.get_params())

# n_estimators = number of trees in the forest
# max_features = max number of features considered for splitting a node
# max_depth = max number of levels in each decision tree
# min_samples_split = min number of data points placed in a node before the node is split
# min_samples_leaf = min number of data points allowed in a leaf node
# bootstrap = method for sampling data points (with or without replacement)

Parameters currently in use:

{'bootstrap': True,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 'warn',
 'n_jobs': None,
 'oob_score': False,
 'random_state': 123,
 'verbose': 0,
 'warm_start': False}


In [130]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 500, stop = 1500, num = 5)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(40, 110, num = 8)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [500, 750, 1000, 1250, 1500]}


In [131]:
# Use the random grid to search for best hyperparameters

# First create the base model to tune
rf = RandomForestRegressor()

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 50, cv = 3, verbose=2, random_state=123, n_jobs = -1)

# Fit the random search model
rf_random.fit(X_train_imp, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 34.5min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators='warn',
                                                   n_jobs=None, oob_score=False,
                                                   random_sta...


In [132]:
# See results
rf_random.best_params_

{'n_estimators': 1250,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 'auto',
 'max_depth': 60,
 'bootstrap': True}

In [133]:
# Evaluate if the random search did indeed produce a better model

def evaluate(model, X_test_imp, y_test):
    predictions = model.predict(X_test_imp)
    errors = abs(predictions - y_test)
    mape = 100 * np.mean(errors / y_test)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy

base_model = RandomForestRegressor(n_estimators = 10, random_state = 123)
base_model.fit(X_train_imp, y_train)
base_accuracy = evaluate(base_model, X_test_imp, y_test)

best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, X_test_imp, y_test)

print('Improvement of {:0.2f}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy))

Model Performance
Average Error: 34.5839 degrees.
Accuracy = 69.73%.
Model Performance
Average Error: 32.3816 degrees.
Accuracy = 71.58%.
Improvement of 2.65%.


In [135]:
evaluate(rf_imp, X_test_imp, y_test)

Model Performance
Average Error: 32.6818 degrees.
Accuracy = 71.20%.


71.20241968190518

In [138]:
# Grid Search with Cross Validation
from sklearn.model_selection import GridSearchCV

# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [55, 60, 65],
    'max_features': [2, 3],
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [4, 5, 6],
    'n_estimators': [1100, 1200, 1300]
}

# Create a based model
rf = RandomForestRegressor()

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [139]:
# Fit the grid search to the data
grid_search.fit(X_train_imp, y_train)
grid_search.best_params_

best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, X_test_imp, y_test)

print('Improvement of {:0.2f}%.'.format( 100 * (grid_accuracy - base_accuracy) / base_accuracy))


Fitting 3 folds for each of 162 candidates, totalling 486 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


KeyboardInterrupt: 

In [141]:
# Final random forest model with tuned hyperparameters
rf_final = RandomForestRegressor(n_estimators = 1250, min_samples_split=5, min_samples_leaf=2, max_depth=60,
                                 random_state = 123)

# Train the random forest
rf_final.fit(X_train_imp, y_train)

print('Training R2:', round(rf_final.score(X_train_imp, y_train), 3))

Training R2: 0.913


In [142]:
# Make predictions and determine the error
predictions = rf_final.predict(X_test_imp) 

In [143]:
# Calculate the absolute errors
errors = abs(predictions - y_test)

# Display the performance metrics
print('Avg error: $', round(np.mean(errors), 2))

mape = 100 * (errors / y_test)
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

r2 = rf_final.score(X_test_imp, y_test)
print('R2:', round(r2, 3))

Avg error: $ 32.41
Accuracy: 71.55 %.
R2: 0.659


### Interpreting the final model and results

In [None]:
# Plot predictions vs actual, grouped by location

In [None]:
# Plot predictions vs actual, grouped by room type