# Tree Models
### Author: Jainam Mehta

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [83]:
# Read the listings table from disk and preview its first rows
DATA_PATH = "Data/data.csv"
df = pd.read_csv(DATA_PATH)
df.head(n=5)

Unnamed: 0,id,hos2_is_superhos2,host_listings_count,neighbourhood_cleansed,neighbourhood_group_cleansed,city,zipcode,property_type,room_type,accommodates,...,maximum_nights_avg_ntm,number_of_reviews,number_of_reviews_ltm,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value
0,2595,1.0,6.0,Midtown,Manhattan,New York,10018,Apartment,Entire home/apt,1,...,1125.0,48,7.0,94.0,9.0,9.0,10.0,10.0,10.0,9.0
1,3831,1.0,1.0,Clinton Hill,Brooklyn,Brooklyn,11238,Guest suite,Entire home/apt,3,...,730.0,295,75.0,90.0,9.0,9.0,10.0,9.0,10.0,9.0
2,5099,1.0,1.0,Murray Hill,Manhattan,New York,10016,Apartment,Entire home/apt,2,...,21.0,78,8.0,90.0,10.0,9.0,10.0,10.0,10.0,9.0
3,5178,1.0,1.0,Hell's Kitchen,Manhattan,New York,10019,Apartment,Private room,2,...,14.0,454,47.0,84.0,9.0,7.0,9.0,9.0,10.0,8.0
4,5238,2.0,4.0,Chinatown,Manhattan,New York,10002,Apartment,Entire home/apt,3,...,1125.0,161,9.0,94.0,10.0,9.0,10.0,10.0,9.0,9.0


In [84]:
# Summary statistics (count, mean, std, min, quartiles, max) per column
summary_stats = df.describe()
summary_stats

Unnamed: 0,id,hos2_is_superhos2,host_listings_count,accommodates,bathrooms,bedrooms,beds,square_feet,price,guests_included,...,maximum_nights_avg_ntm,number_of_reviews,number_of_reviews_ltm,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value
count,29351.0,28991.0,28991.0,29351.0,29334.0,29324.0,29323.0,256.0,29351.0,29351.0,...,29351.0,29351.0,29351.0,28986.0,28982.0,28982.0,28982.0,28982.0,28982.0,28982.0
mean,23081540.0,1.308268,8.334794,2.950598,1.133633,1.185548,1.597074,666.296875,129.492249,1.637184,...,10026.02,39.444993,15.849579,94.020182,9.635808,9.327755,9.758505,9.750776,9.608067,9.393072
std,12191660.0,0.461786,55.945113,1.876624,0.3909,0.720719,1.144621,476.330391,85.673312,1.208648,...,660842.8,56.69436,18.636539,8.069696,0.794875,0.97616,0.686061,0.725659,0.699269,0.879868
min,2595.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,10.0,1.0,...,1.0,1.0,1.0,20.0,2.0,2.0,2.0,2.0,2.0,2.0
25%,13694050.0,1.0,1.0,2.0,1.0,1.0,1.0,315.0,68.0,1.0,...,29.0,5.0,3.0,92.0,9.0,9.0,10.0,10.0,9.0,9.0
50%,25230900.0,1.0,1.0,2.0,1.0,1.0,1.0,700.0,100.0,1.0,...,365.0,17.0,9.0,96.0,10.0,10.0,10.0,10.0,10.0,10.0
75%,33717730.0,2.0,3.0,4.0,1.0,1.0,2.0,900.0,168.0,2.0,...,1125.0,50.0,23.0,99.0,10.0,10.0,10.0,10.0,10.0,10.0
max,40565660.0,2.0,1767.0,16.0,7.0,21.0,26.0,2400.0,500.0,16.0,...,49481220.0,675.0,407.0,100.0,10.0,10.0,10.0,10.0,10.0,10.0


In [93]:
# Keep only the target (price) and the room-related columns used for modelling
room_columns = [
    'price',
    'property_type',
    'room_type',
    'accommodates',
    'bathrooms',
    'bedrooms',
    'beds',
]

# Restrict to those columns, then drop every row with a NaN in any of them.
# Note: this rebinds `df`, so the full table is no longer available afterwards.
df = df.loc[:, room_columns].dropna()

In [94]:
# Target: the listing price
y = df['price']

# Predictors: room attributes; the string-valued columns (property_type,
# room_type) are expanded into one indicator column per category.
feature_columns = ['property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds']
X = pd.get_dummies(df[feature_columns])
X.head(n=5)

Unnamed: 0,accommodates,bathrooms,bedrooms,beds,property_type_Aparthotel,property_type_Apartment,property_type_Barn,property_type_Bed and breakfast,property_type_Boat,property_type_Boutique hotel,...,property_type_Serviced apartment,property_type_Tent,property_type_Tiny house,property_type_Townhouse,property_type_Villa,property_type_Yurt,room_type_Entire home/apt,room_type_Hotel room,room_type_Private room,room_type_Shared room
0,1,1.0,0.0,1.0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,3,1.0,1.0,4.0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,2,1.0,1.0,1.0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,2,1.0,1.0,1.0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,3,1.0,1.0,2.0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [99]:
# Use scikit-learn to split the data into training and testing sets
from sklearn.model_selection import train_test_split

# 70% of the rows go to training, 30% to testing; the fixed seed keeps the
# split reproducible between runs.
split_parts = train_test_split(
    X,
    y,
    test_size=0.3,
    train_size=0.7,
    random_state=123,
)
X_train, X_test, y_train, y_test = split_parts

In [96]:
# Sanity check: True if any value in the `beds` feature is missing
X['beds'].isna().any()

False

In [101]:
# Convert the training features and training target to NumPy arrays
X_train, y_train = np.array(X_train), np.array(y_train)

In [102]:
# Random forest regressor from scikit-learn
from sklearn.ensemble import RandomForestRegressor

# Forest of 1000 decision trees; the fixed seed makes training reproducible
forest_params = {"n_estimators": 1000, "random_state": 123}
rf = RandomForestRegressor(**forest_params)

# Fit on the training split (trailing `;` suppresses the estimator repr)
rf.fit(X_train, y_train);

In [104]:
# Use the forest's predict method on the test data.
# The model was fitted on a plain NumPy array (no column names), so the test
# features are passed in the same form; handing it the DataFrame directly
# makes recent scikit-learn versions warn about feature names that were not
# seen during fit. `X_test` itself is left unchanged.
predictions = rf.predict(np.asarray(X_test))

# Calculate the absolute errors; `y_test` is kept as a Series so `errors`
# retains the original row index.
errors = abs(predictions - y_test)
# Print out the mean absolute error (MAE), in units of `price`
print(round(np.mean(errors), 2))

42.36


In [105]:
# Preview the first five per-listing absolute errors
errors.head(n=5)

4389     52.986958
347       5.149361
17668    20.517683
12429    76.884415
20193    12.989441
Name: price, dtype: float64