# Tree Models
### Author: Jainam Mehta

In [129]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Load data and prepare it for the RandomForest Model

In [130]:
# Load the data
df = pd.read_csv("Data/data.csv")
df.head()

Unnamed: 0,id,hos2_is_superhos2,host_listings_count,neighbourhood_cleansed,neighbourhood_group_cleansed,city,zipcode,property_type,room_type,accommodates,...,maximum_nights_avg_ntm,number_of_reviews,number_of_reviews_ltm,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value
0,2595,1.0,6.0,Midtown,Manhattan,New York,10018,Apartment,Entire home/apt,1,...,1125.0,48,7.0,94.0,9.0,9.0,10.0,10.0,10.0,9.0
1,3831,1.0,1.0,Clinton Hill,Brooklyn,Brooklyn,11238,Guest suite,Entire home/apt,3,...,730.0,295,75.0,90.0,9.0,9.0,10.0,9.0,10.0,9.0
2,5099,1.0,1.0,Murray Hill,Manhattan,New York,10016,Apartment,Entire home/apt,2,...,21.0,78,8.0,90.0,10.0,9.0,10.0,10.0,10.0,9.0
3,5178,1.0,1.0,Hell's Kitchen,Manhattan,New York,10019,Apartment,Private room,2,...,14.0,454,47.0,84.0,9.0,7.0,9.0,9.0,10.0,8.0
4,5238,2.0,4.0,Chinatown,Manhattan,New York,10002,Apartment,Entire home/apt,3,...,1125.0,161,9.0,94.0,10.0,9.0,10.0,10.0,9.0,9.0


In [131]:
# Descriptive statistics for each column
df.describe()

Unnamed: 0,id,hos2_is_superhos2,host_listings_count,accommodates,bathrooms,bedrooms,beds,price,guests_included,minimum_nights,...,maximum_nights_avg_ntm,number_of_reviews,number_of_reviews_ltm,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value
count,28268.0,28268.0,28268.0,28268.0,28268.0,28268.0,28268.0,28268.0,28268.0,28268.0,...,28268.0,28268.0,28268.0,28268.0,28268.0,28268.0,28268.0,28268.0,28268.0,28268.0
mean,22984580.0,1.312509,8.427268,2.953976,1.134339,1.186819,1.600255,129.525647,1.641326,5.757677,...,10386.68,39.998054,16.066648,94.03251,9.636939,9.328216,9.759764,9.751097,9.609276,9.395323
std,12152850.0,0.463524,56.331599,1.870513,0.391389,0.722178,1.146669,85.643423,1.212901,15.842635,...,673380.7,56.912793,18.678909,8.043051,0.793117,0.974955,0.68179,0.722382,0.695249,0.877691
min,2595.0,1.0,0.0,1.0,0.0,0.0,0.0,10.0,1.0,1.0,...,1.0,1.0,1.0,20.0,2.0,2.0,2.0,2.0,2.0,2.0
25%,13642440.0,1.0,1.0,2.0,1.0,1.0,1.0,68.0,1.0,1.0,...,29.0,5.0,3.0,92.0,9.0,9.0,10.0,10.0,9.0,9.0
50%,25026260.0,1.0,1.0,2.0,1.0,1.0,1.0,100.0,1.0,2.0,...,365.0,17.0,9.0,96.0,10.0,10.0,10.0,10.0,10.0,10.0
75%,33611990.0,2.0,3.0,4.0,1.0,1.0,2.0,168.0,2.0,4.0,...,1125.0,51.0,24.0,99.0,10.0,10.0,10.0,10.0,10.0,10.0
max,40513120.0,2.0,1767.0,16.0,7.0,21.0,26.0,500.0,16.0,1125.0,...,49481220.0,675.0,407.0,100.0,10.0,10.0,10.0,10.0,10.0,10.0


In [132]:
# drop unecessary columns
df = df.drop(['neighbourhood_cleansed', 'city', 'zipcode', 'security_deposit', 'cleaning_fee', 'extra_people'], axis=1)
df.columns

Index(['id', 'hos2_is_superhos2', 'host_listings_count',
       'neighbourhood_group_cleansed', 'property_type', 'room_type',
       'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type',
       'amenities', 'price', 'guests_included', 'minimum_nights',
       'maximum_nights', 'minimum_minimum_nights', 'maximum_minimum_nights',
       'minimum_maximum_nights', 'maximum_maximum_nights',
       'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'number_of_reviews',
       'number_of_reviews_ltm', 'review_scores_rating',
       'review_scores_accuracy', 'review_scores_cleanliness',
       'review_scores_checkin', 'review_scores_communication',
       'review_scores_location', 'review_scores_value'],
      dtype='object')

In [133]:
# split up amenities
# for now ignore amenities
df = df.drop('amenities', axis=1)
df.columns

Index(['id', 'hos2_is_superhos2', 'host_listings_count',
       'neighbourhood_group_cleansed', 'property_type', 'room_type',
       'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'price',
       'guests_included', 'minimum_nights', 'maximum_nights',
       'minimum_minimum_nights', 'maximum_minimum_nights',
       'minimum_maximum_nights', 'maximum_maximum_nights',
       'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'number_of_reviews',
       'number_of_reviews_ltm', 'review_scores_rating',
       'review_scores_accuracy', 'review_scores_cleanliness',
       'review_scores_checkin', 'review_scores_communication',
       'review_scores_location', 'review_scores_value'],
      dtype='object')

In [134]:
# Split data into dependent and predictor variables
y = df['price']
print(y.head())

X = df.drop('price', axis=1)
X.head()

0    225.0
1     89.0
2    200.0
3     79.0
4    150.0
Name: price, dtype: float64


Unnamed: 0,id,hos2_is_superhos2,host_listings_count,neighbourhood_group_cleansed,property_type,room_type,accommodates,bathrooms,bedrooms,beds,...,maximum_nights_avg_ntm,number_of_reviews,number_of_reviews_ltm,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value
0,2595,1.0,6.0,Manhattan,Apartment,Entire home/apt,1,1.0,0.0,1.0,...,1125.0,48,7.0,94.0,9.0,9.0,10.0,10.0,10.0,9.0
1,3831,1.0,1.0,Brooklyn,Guest suite,Entire home/apt,3,1.0,1.0,4.0,...,730.0,295,75.0,90.0,9.0,9.0,10.0,9.0,10.0,9.0
2,5099,1.0,1.0,Manhattan,Apartment,Entire home/apt,2,1.0,1.0,1.0,...,21.0,78,8.0,90.0,10.0,9.0,10.0,10.0,10.0,9.0
3,5178,1.0,1.0,Manhattan,Apartment,Private room,2,1.0,1.0,1.0,...,14.0,454,47.0,84.0,9.0,7.0,9.0,9.0,10.0,8.0
4,5238,2.0,4.0,Manhattan,Apartment,Entire home/apt,3,1.0,1.0,2.0,...,1125.0,161,9.0,94.0,10.0,9.0,10.0,10.0,9.0,9.0


In [135]:
# one-hot encode the data 
X = pd.get_dummies(X)
predictor_list = X.columns.tolist()

# check the columns have been dummified
X.iloc[:, 25:35].head(10)

Unnamed: 0,neighbourhood_group_cleansed_Bronx,neighbourhood_group_cleansed_Brooklyn,neighbourhood_group_cleansed_Manhattan,neighbourhood_group_cleansed_Queens,neighbourhood_group_cleansed_Staten Island,property_type_Aparthotel,property_type_Apartment,property_type_Barn,property_type_Bed and breakfast,property_type_Boat
0,0,0,1,0,0,0,1,0,0,0
1,0,1,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,1,0,0,0
3,0,0,1,0,0,0,1,0,0,0
4,0,0,1,0,0,0,1,0,0,0
5,0,0,1,0,0,0,1,0,0,0
6,0,1,0,0,0,0,0,0,0,0
7,0,1,0,0,0,0,1,0,0,0
8,0,0,1,0,0,0,1,0,0,0
9,0,1,0,0,0,0,0,0,0,0


In [144]:
# check dimensions
print(y.shape)
X.shape

(28268,)


(28268, 68)

### Create train/test split

In [149]:
# covert to numpy arrays
y = np.array(y)
X = np.array(X)

In [150]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, train_size = 0.7, random_state = 123)

In [151]:
# check dimensions of resulting split
print('Training Features Shape:', X_train.shape)
print('Training Labels Shape:', y_train.shape)
print('Testing Features Shape:', X_test.shape)
print('Testing Labels Shape:', y_test.shape)

Training Features Shape: (19787, 68)
Training Labels Shape: (19787,)
Testing Features Shape: (8481, 68)
Testing Labels Shape: (8481,)


### Training the model

In [102]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor

# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 123)

# Train the model on training data
rf.fit(X_train, y_train);

### Generating predictions on the test set

In [104]:
# Use the forest's predict method on the test data
predictions = rf.predict(X_test)

# Calculate the absolute errors
errors = abs(predictions - y_test)
# Print out the mean absolute error (mae)
print(round(np.mean(errors), 2))

42.36


In [105]:
errors.head()

4389     52.986958
347       5.149361
17668    20.517683
12429    76.884415
20193    12.989441
Name: price, dtype: float64