In [10]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor

In [11]:
df_full = pd.read_hdf('data/df_full.h5')
df_full.head()

Unnamed: 0,id,date,available,price,minimum_nights,maximum_nights,host_id,host_response_time,host_is_superhost,host_listings_count,...,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,calculated_host_listings_count,review,month
0,2595,2022-06-04,0,225.0,30,1125,2845.0,a few days or more,0.0,6.0,...,4.72,4.62,4.76,4.79,4.86,4.41,0.0,3.0,0 11/21/2009\n1 5/28/2009\n2 ...,Jun
1,2595,2022-06-05,0,225.0,30,1125,2845.0,a few days or more,0.0,6.0,...,4.72,4.62,4.76,4.79,4.86,4.41,0.0,3.0,0 11/21/2009\n1 5/28/2009\n2 ...,Jun
2,2595,2022-06-06,0,225.0,30,1125,2845.0,a few days or more,0.0,6.0,...,4.72,4.62,4.76,4.79,4.86,4.41,0.0,3.0,0 11/21/2009\n1 5/28/2009\n2 ...,Jun
3,2595,2022-06-07,0,225.0,30,1125,2845.0,a few days or more,0.0,6.0,...,4.72,4.62,4.76,4.79,4.86,4.41,0.0,3.0,0 11/21/2009\n1 5/28/2009\n2 ...,Jun
4,2595,2022-06-08,0,225.0,30,1125,2845.0,a few days or more,0.0,6.0,...,4.72,4.62,4.76,4.79,4.86,4.41,0.0,3.0,0 11/21/2009\n1 5/28/2009\n2 ...,Jun


In [12]:
df_full.shape

(1036165, 36)

In [13]:
# Keep only the Bronx rows. Filtering first and copying the (much smaller)
# result avoids duplicating the entire ~1M-row frame, while still yielding an
# independent DataFrame that can be modified without touching df_full.
Bronx = df_full[df_full['neighbourhood_group_cleansed'] == 'Bronx'].copy()
Bronx.shape

(17520, 36)

We will first make a baseline:

In [14]:
# Naive baseline: predict the median Bronx price for every row.
m = Bronx['price'].median()
print(m)
# Constant Series that shares the price column's index and name.
pred = pd.Series(m, index=Bronx.index, name='price', dtype='float64')
pred.head()

85.0


32850    85.0
32851    85.0
32852    85.0
32853    85.0
32854    85.0
Name: price, dtype: float64

In [15]:
metrics.mean_squared_error(Bronx['price'], pred)

1481.3062785388129

In [16]:
np.sqrt(metrics.mean_squared_error(Bronx['price'], pred))

38.48774192569386

In [17]:
# Mean absolute error of the constant-median baseline.
residuals = Bronx['price'].sub(pred)
residuals.abs().mean()

29.70433789954338

Next let's export the data

In [18]:
# Write the Bronx subset to HDF5 under key 'stage'; mode='w' replaces any existing file.
# NOTE(review): the original author marked this cell "run twice". The reason is
# not visible here — confirm that a single run works on a fresh kernel.
Bronx.to_hdf(r'data/Bronx.h5', key='stage', mode='w') #run twice

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block3_values] [items->Index(['host_response_time', 'host_verifications', 'neighbourhood_cleansed',
       'neighbourhood_group_cleansed', 'property_type', 'room_type',
       'amenities', 'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value', 'review', 'month'],
      dtype='object')]

  pytables.to_hdf(


Make dummies

In [19]:
# One-hot encode the two categorical predictors; every other column is kept as-is.
Bronx_dummies = pd.get_dummies(
    Bronx,
    columns=['host_response_time', 'room_type'],
)

In [20]:
# Correlation of each numeric/boolean column with price, sorted ascending.
# The frame still holds object columns (e.g. 'amenities', 'review'); newer
# pandas versions no longer drop those silently inside corr() and raise
# instead, so restrict to numeric and bool dtypes explicitly. Bool is included
# because newer get_dummies versions emit boolean indicator columns.
(
    Bronx_dummies
    .select_dtypes(include=['number', 'bool'])
    .corr()['price']
    .sort_values()
)

room_type_Private room                  -0.507650
latitude                                -0.199974
host_id                                 -0.138802
id                                      -0.136291
room_type_Shared room                   -0.098671
host_response_time_unknown              -0.097801
instant_bookable                        -0.077536
availability_365                        -0.055791
available                               -0.043917
has_availability                        -0.036557
host_response_time_within a day         -0.028084
number_of_reviews                       -0.018393
host_response_time_within an hour        0.034249
minimum_nights                           0.037796
host_response_time_a few days or more    0.049188
host_response_time_within a few hours    0.070533
host_listings_count                      0.095179
calculated_host_listings_count           0.114556
host_identity_verified                   0.118757
longitude                                0.157628


In [21]:
def tts_borough(df, xvars, dge, target='price', random_state=42, test_size=None):
    """Expand the chosen predictors polynomially and split into train/test sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the predictor columns and the target column.
    xvars : list of str
        Names of the predictor columns to select from ``df``.
    dge : int
        Polynomial degree passed to ``PolynomialFeatures`` (no bias column).
    target : str, default 'price'
        Name of the column to predict.
    random_state : int or None, default 42
        Seed forwarded to ``train_test_split`` so the split is repeatable.
    test_size : float, int or None, default None
        Forwarded to ``train_test_split``; ``None`` keeps scikit-learn's default.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` where the ``X`` parts are the
        polynomial feature arrays and the ``y`` parts are slices of
        ``df[target]``.

    Raises
    ------
    KeyError
        If any name in ``xvars`` or ``target`` is not a column of ``df``.
    """
    X = df[xvars]
    y = df[target]
    # PolynomialFeatures only records the number of input columns when fitted,
    # so expanding before the split does not leak test-set statistics.
    poly = PolynomialFeatures(include_bias=False, degree=dge)
    X_poly = poly.fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(
        X_poly, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

In [22]:
# Predictors for the linear model; they are expanded to degree-2 polynomial terms.
xvars = [
    'room_type_Entire home/apt',
    'accommodates',
    'room_type_Private room',
    'maximum_nights',
    'latitude',
    'beds',
    'host_is_superhost',
    'longitude',
]
X_train, X_test, y_train, y_test = tts_borough(Bronx_dummies, xvars, dge=2)

Let's make a linear regression model


In [23]:
lr = LinearRegression()

In [24]:
cross_val_score(lr, X_train, y_train, cv = 5).mean()

0.856677176173981

In [25]:
lr.fit(X_train, y_train)

LinearRegression()

In [26]:
prediction = lr.predict(X_test)

The fitted model's intercept and coefficients are

In [27]:
print(f'Model intercept: {lr.intercept_}')
print(f'Model coefficient values: {lr.coef_}')

Model intercept: 3924704.4897128073
Model coefficient values: [-1.41110287e+08  6.20518929e+04 -4.70753150e+07 -1.89820692e+05
  6.08636337e+05 -3.74861515e+04  2.95851988e+04 -9.86473706e+05
  4.70847999e+07  3.57393665e+04  1.54340720e-02  1.89632602e+05
  4.96751420e+05 -1.39529480e+04  1.47948775e+04 -8.43540016e+05
  2.46119009e+00  3.57967582e+04 -8.03335898e-02 -3.51459757e+02
  1.45920470e+01  4.65303669e+01  1.12952627e+03 -4.70753151e+07
  1.89632506e+05  4.99493699e+05 -1.40484385e+04  1.47903196e+04
 -8.43718119e+05  8.84133239e-05  1.60837269e+00  3.89317370e-03
 -4.50966620e-02 -1.65844170e+00 -1.92829277e+04  8.80258313e+02
  1.25416962e+03 -6.29440339e+03 -1.91046786e+01 -1.28613380e+02
 -2.11344131e+02  2.95851977e+04  1.69363547e+03 -1.41207075e+04]


The R^2 value is

In [28]:
print(f'Training R2: {lr.score(X_train, y_train)}')
print(f'Testing R2: {lr.score(X_test, y_test)}')

Training R2: 0.8571985277314411
Testing R2: 0.8578894589148285


The mean squared error is


In [29]:
metrics.mean_squared_error(y_test, prediction)

208.24912690807247

And the root mean squared error is

In [30]:
np.sqrt(metrics.mean_squared_error(y_test, prediction))

14.430839438787768

How about the mean absolute residual (i.e. the mean absolute error)?


In [31]:
# Mean absolute error of the linear model on the held-out rows.
residuals = y_test.sub(prediction)
residuals.abs().mean()

10.531000612429917

This model is very good. Let's see if we can do better with a decision tree.


In [32]:
# Reduced feature set for the tree models (raw columns, no polynomial expansion).
xvars = [
    'room_type_Entire home/apt',
    'accommodates',
    'room_type_Private room',
    'maximum_nights',
    'latitude',
]
X, y = Bronx_dummies[xvars], Bronx_dummies['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [33]:
# NOTE(review): the original note says this was tweaked until it gave the best
# result, but only default hyperparameters (plus a fixed seed) are set here.
dt = DecisionTreeRegressor(random_state=42)

In [34]:
dt.fit(X_train, y_train)

DecisionTreeRegressor(random_state=42)

In [35]:
prediction = dt.predict(X_test)

In [36]:
print(f'Score on training set: {dt.score(X_train, y_train)}')
print(f'Score on testing set: {dt.score(X_test, y_test)}')

Score on training set: 0.9888160419467622
Score on testing set: 0.9890075801431886


In [37]:
metrics.mean_squared_error(y_test, prediction)

16.108318357721032

In [38]:
np.sqrt(metrics.mean_squared_error(y_test, prediction))

4.013516956202008

In [39]:
# Mean absolute error of the decision tree on the held-out rows.
residuals = y_test.sub(prediction)
residuals.abs().mean()

2.1321772593829293

This model is great. Let's look at a bagging model now.

In [40]:
# Same five predictors and the same seeded split as the decision-tree section.
xvars = [
    'room_type_Entire home/apt',
    'accommodates',
    'room_type_Private room',
    'maximum_nights',
    'latitude',
]
X = Bronx_dummies.loc[:, xvars]
y = Bronx_dummies.loc[:, 'price']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [41]:
#From lesson 6.02
bag = BaggingRegressor(random_state = 42)
bag.fit(X_train, y_train)

BaggingRegressor(random_state=42)

In [42]:
prediction = bag.predict(X_test)

In [43]:
print(f'Score on training set: {bag.score(X_train, y_train)}')
print(f'Score on testing set: {bag.score(X_test, y_test)}')

Score on training set: 0.9888120445955765
Score on testing set: 0.9889705258511065


In [44]:
metrics.mean_squared_error(y_test, prediction)

16.16261780599123

In [45]:
np.sqrt(metrics.mean_squared_error(y_test, prediction))

4.020275837052879

In [46]:
# Mean absolute error of the bagging model on the held-out rows.
residuals = y_test.sub(prediction)
residuals.abs().mean()

2.127471242963113

___

RandomForest:

In [47]:
# Rebuild the five-feature matrix and the seeded split used by the forest models.
xvars = [
    'room_type_Entire home/apt',
    'accommodates',
    'room_type_Private room',
    'maximum_nights',
    'latitude',
]
X = Bronx_dummies[xvars]
y = Bronx_dummies['price']
split = train_test_split(X, y, random_state=42)
X_train, X_test, y_train, y_test = split

In [48]:
# Seed the forest so its scores are reproducible across kernel restarts,
# matching the seeded decision tree and bagging models.
rf = RandomForestRegressor(n_estimators = 100, random_state = 42)

In [49]:
rf.fit(X_train,y_train)

RandomForestRegressor()

In [50]:
prediction = rf.predict(X_test)

In [51]:
print(f'Score on training set: {rf.score(X_train, y_train)}')
print(f'Score on testing set: {rf.score(X_test, y_test)}')

Score on training set: 0.988815684910948
Score on testing set: 0.9890069237984309


In [52]:
metrics.mean_squared_error(y_test, prediction)

16.109280166899293

In [53]:
np.sqrt(metrics.mean_squared_error(y_test, prediction))

4.013636775656125

In [54]:
# Mean absolute error of the random forest on the held-out rows.
residuals = y_test.sub(prediction)
residuals.abs().mean()

2.12941708540968

ExtraTrees:

In [55]:
# Seed the extra-trees ensemble so its scores are reproducible across kernel
# restarts, matching the seeded decision tree and bagging models.
et = ExtraTreesRegressor(n_estimators = 100, random_state = 42)

In [56]:
et.fit(X_train, y_train)

ExtraTreesRegressor()

In [57]:
prediction = et.predict(X_test)

In [58]:
print(f'Score on training set: {et.score(X_train, y_train)}')
print(f'Score on testing set: {et.score(X_test, y_test)}')

Score on training set: 0.9888160419467622
Score on testing set: 0.9890075801431886


In [59]:
metrics.mean_squared_error(y_test, prediction)

16.108318357721032

In [60]:
np.sqrt(metrics.mean_squared_error(y_test, prediction))

4.013516956202008

In [61]:
# Mean absolute error of the extra-trees model on the held-out rows.
residuals = y_test.sub(prediction)
residuals.abs().mean()

2.1321772593829396

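Decision Tree and Extra Trees are the best models here (their scores are identical), so I'll go with the decision tree for the Streamlit app. One caveat: the data appears to contain one row per listing per calendar date, so a random row-level split puts the same listings in both the training and test sets; the near-identical train and test scores should be read with that in mind.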


We will pickle this object to use for our streamlit app

In [49]:
# Save the fitted decision tree for the Streamlit app. Create the target
# directory first so this cell also works when 'models/' does not exist yet.
os.makedirs('models', exist_ok=True)
with open('models/bronx_pickle.pkl', 'wb') as f:
    pickle.dump(dt, f)