In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, ExtraTreesRegressor
from sklearn import metrics
import pickle

In [2]:
df_full = pd.read_hdf('data/df_full.h5')
df_full.head()

Unnamed: 0,id,date,available,price,minimum_nights,maximum_nights,host_id,host_response_time,host_is_superhost,host_listings_count,...,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,calculated_host_listings_count,review,month
0,2595,2022-06-04,0,225.0,30,1125,2845.0,a few days or more,0.0,6.0,...,4.72,4.62,4.76,4.79,4.86,4.41,0.0,3.0,0 11/21/2009\n1 5/28/2009\n2 ...,Jun
1,2595,2022-06-05,0,225.0,30,1125,2845.0,a few days or more,0.0,6.0,...,4.72,4.62,4.76,4.79,4.86,4.41,0.0,3.0,0 11/21/2009\n1 5/28/2009\n2 ...,Jun
2,2595,2022-06-06,0,225.0,30,1125,2845.0,a few days or more,0.0,6.0,...,4.72,4.62,4.76,4.79,4.86,4.41,0.0,3.0,0 11/21/2009\n1 5/28/2009\n2 ...,Jun
3,2595,2022-06-07,0,225.0,30,1125,2845.0,a few days or more,0.0,6.0,...,4.72,4.62,4.76,4.79,4.86,4.41,0.0,3.0,0 11/21/2009\n1 5/28/2009\n2 ...,Jun
4,2595,2022-06-08,0,225.0,30,1125,2845.0,a few days or more,0.0,6.0,...,4.72,4.62,4.76,4.79,4.86,4.41,0.0,3.0,0 11/21/2009\n1 5/28/2009\n2 ...,Jun


In [3]:
df_full.shape

(1036165, 36)

In [4]:
Manhattan = df_full.copy()
Manhattan = Manhattan[Manhattan['neighbourhood_group_cleansed'] == 'Manhattan']
Manhattan.shape

(469320, 36)

We will first make a baseline:

In [5]:
pred = Manhattan['price']
m = pred.median()
print(m)
pred = pred.map(lambda x: m)
pred.head()

150.0


0    150.0
1    150.0
2    150.0
3    150.0
4    150.0
Name: price, dtype: float64

In [6]:
metrics.mean_squared_error(Manhattan['price'], pred)

293408.1920075854

In [7]:
np.sqrt(metrics.mean_squared_error(Manhattan['price'], pred))

541.6716643942024

In [8]:
residuals = Manhattan['price'] - pred
np.abs(residuals).mean()

126.11349825279127

Next let's export the data

In [9]:
Manhattan.to_hdf(r'data/Manhattan.h5', key='stage', mode='w') #run twice

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block3_values] [items->Index(['host_response_time', 'host_verifications', 'neighbourhood_cleansed',
       'neighbourhood_group_cleansed', 'property_type', 'room_type',
       'amenities', 'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value', 'review', 'month'],
      dtype='object')]

  pytables.to_hdf(


As we saw in the EDA section, we should make dummies of host_response_time and room_type


In [10]:
Manhattan_dummies = pd.get_dummies(Manhattan, columns=['host_response_time', 'room_type'])

In [11]:
Manhattan_dummies.corr()['price'].sort_values()

room_type_Private room                  -0.128061
longitude                               -0.107898
latitude                                -0.096309
has_availability                        -0.076666
number_of_reviews                       -0.054570
host_response_time_within a day         -0.041290
host_identity_verified                  -0.034939
host_is_superhost                       -0.019604
calculated_host_listings_count          -0.016046
host_response_time_within an hour       -0.014874
instant_bookable                        -0.012008
host_response_time_within a few hours   -0.008531
host_id                                  0.000784
host_has_profile_pic                     0.001274
id                                       0.001468
room_type_Shared room                    0.003272
host_listings_count                      0.004914
available                                0.015329
availability_365                         0.018623
host_response_time_unknown               0.026754


Let's create a model. I'll keep adding features until the model doesn't get better. (Note: I did not save all of them what you are seeing is the best model)

In [12]:
def tts_borough(df, xvars, dge):
    X = df[xvars]
    y = df['price']
    poly = PolynomialFeatures(include_bias = False, degree = dge)
    X_poly = poly.fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X_poly, y, random_state=42)
    return X_train, X_test, y_train, y_test

In [13]:
xvars = ['accommodates', 'beds', 'room_type_Private room', 'room_type_Entire home/apt', 'longitude', 'latitude', 'has_availability']
X_train, X_test, y_train, y_test = tts_borough(Manhattan_dummies, xvars, 2)

Let's make a linear regression model


In [14]:
lr = LinearRegression()

In [15]:
cross_val_score(lr, X_train, y_train, cv = 5).mean()

0.14992254254323414

In [16]:
lr.fit(X_train, y_train)

LinearRegression()

In [17]:
prediction = lr.predict(X_test)

The function is


In [18]:
print(f'Model intercept: {lr.intercept_}')
print(f'Model coefficient values: {lr.coef_}')

Model intercept: -531306042.6143767
Model coefficient values: [-1.01403997e+05  1.74468510e+05  1.10669162e+04  6.41183001e+04
 -1.17780751e+07  4.69007473e+06 -5.17404389e+04  2.98247609e+00
 -4.54046484e+00 -1.33172658e+02 -8.63614122e+01 -1.27509522e+03
  1.75022759e+02  9.74183237e+01  5.66306659e+00  1.85526142e+02
  1.43533249e+02  1.68401727e+03 -1.22127733e+03 -2.83689769e+02
  1.10669163e+04  1.55542511e-06  4.22198899e+02  2.21003791e+02
  1.46667907e+02  6.41183005e+04  1.46014147e+03 -4.96444605e+02
  6.02715899e+01 -6.91263456e+04  3.80068200e+04 -7.46493526e+02
 -2.30322881e+04  1.18339523e+03 -5.17404389e+04]


The R^2 value is

In [19]:
print(f'Training R2: {lr.score(X_train, y_train)}')
print(f'Testing R2: {lr.score(X_test, y_test)}')

Training R2: 0.141631773839774
Testing R2: 0.19491185140151768


The mean squared error is


In [20]:
metrics.mean_squared_error(y_test, prediction)

175396.3974895145

And the root mean squered error is


In [21]:
np.sqrt(metrics.mean_squared_error(y_test, prediction))

418.803530894278

How about the mean of residuals


In [22]:
residuals = y_test - prediction
np.abs(residuals).mean()

117.34281586432186

This model isn't very good. There error is high and doesn't count for much of the data. Let's try the decision tree regressor model

In [23]:
Manhattan_dummies.corr()['price'].sort_values()

room_type_Private room                  -0.128061
longitude                               -0.107898
latitude                                -0.096309
has_availability                        -0.076666
number_of_reviews                       -0.054570
host_response_time_within a day         -0.041290
host_identity_verified                  -0.034939
host_is_superhost                       -0.019604
calculated_host_listings_count          -0.016046
host_response_time_within an hour       -0.014874
instant_bookable                        -0.012008
host_response_time_within a few hours   -0.008531
host_id                                  0.000784
host_has_profile_pic                     0.001274
id                                       0.001468
room_type_Shared room                    0.003272
host_listings_count                      0.004914
available                                0.015329
availability_365                         0.018623
host_response_time_unknown               0.026754


Again, I did not save all of them what you are seeing is the best model


In [24]:
xvars = ['accommodates', 'beds', 'room_type_Private room', 'room_type_Entire home/apt', 'longitude', 'latitude']
X = Manhattan_dummies[xvars]
y = Manhattan_dummies['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [25]:
#I tweeked this until I got the best result
dt = DecisionTreeRegressor(random_state=42, max_depth=35, min_samples_split=2, min_samples_leaf=2)

In [26]:
dt.fit(X_train, y_train)

DecisionTreeRegressor(max_depth=35, min_samples_leaf=2, random_state=42)

In [27]:
prediction = dt.predict(X_test)

In [28]:
print(f'Score on training set: {dt.score(X_train, y_train)}')
print(f'Score on testing set: {dt.score(X_test, y_test)}')

Score on training set: 0.47327498499947585
Score on testing set: 0.6028663936001599


In [29]:
metrics.mean_squared_error(y_test, prediction)

86519.4749243413

In [30]:
np.sqrt(metrics.mean_squared_error(y_test, prediction))

294.14192989837625

In [31]:
residuals = y_test - prediction
np.abs(residuals).mean()

13.617887454514777

This model is definitly better than the linear regression model. However, it is somewhat underfit.

Let's try bagging it


In [32]:
xvars = ['accommodates', 'beds', 'room_type_Private room', 'room_type_Entire home/apt', 'longitude', 'latitude']
X = Manhattan_dummies[xvars]
y = Manhattan_dummies['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [33]:
#From lesson 6.02
bag = BaggingRegressor(random_state = 42)
bag.fit(X_train, y_train)

BaggingRegressor(random_state=42)

In [34]:
prediction = bag.predict(X_test)

In [35]:
print(f'Score on training set: {bag.score(X_train, y_train)}')
print(f'Score on testing set: {bag.score(X_test, y_test)}')

Score on training set: 0.47280214893033623
Score on testing set: 0.595230603765355


In [36]:
metrics.mean_squared_error(y_test, prediction)

88183.00708705328

In [37]:
np.sqrt(metrics.mean_squared_error(y_test, prediction))

296.9562376631501

In [38]:
residuals = y_test - prediction
np.abs(residuals).mean()

13.903125737731443

Not an improvement.

Let's try a randomforest and an extratrees regressor


In [39]:
xvars = ['accommodates', 'beds', 'room_type_Private room', 'room_type_Entire home/apt', 'longitude', 'latitude']
X = Manhattan_dummies[xvars]
y = Manhattan_dummies['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [40]:
rf = RandomForestRegressor(n_estimators = 100)

In [41]:
rf.fit(X_train,y_train)

RandomForestRegressor()

In [42]:
prediction = rf.predict(X_test)

In [43]:
print(f'Score on training set: {rf.score(X_train, y_train)}')
print(f'Score on testing set: {rf.score(X_test, y_test)}')

Score on training set: 0.4732710850238875
Score on testing set: 0.6033909604476098


In [44]:
metrics.mean_squared_error(y_test, prediction)

86405.19286038929

In [45]:
np.sqrt(metrics.mean_squared_error(y_test, prediction))

293.94760223616265

In [46]:
residuals = y_test - prediction
np.abs(residuals).mean()

13.603366390914191

In [47]:
#Extratrees
xvars = ['accommodates', 'beds', 'room_type_Private room', 'room_type_Entire home/apt', 'longitude', 'latitude']
X = Manhattan_dummies[xvars]
y = Manhattan_dummies['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [48]:
et = ExtraTreesRegressor(n_estimators = 100)

In [49]:
et.fit(X_train, y_train)

ExtraTreesRegressor()

In [50]:
prediction = et.predict(X_test)

In [51]:
print(f'Score on training set: {et.score(X_train, y_train)}')
print(f'Score on testing set: {et.score(X_test, y_test)}')

Score on training set: 0.47327498499947573
Score on testing set: 0.6028663936001599


In [52]:
metrics.mean_squared_error(y_test, prediction)

86519.4749243413

In [53]:
np.sqrt(metrics.mean_squared_error(y_test, prediction))

294.14192989837625

In [54]:
residuals = y_test - prediction
np.abs(residuals).mean()

13.617887454514777

Bagging seems to be the best models. I will go with bagging for the streamlit app.

We will pickle this object to use for our streamlit app

In [56]:
with open('models/manhattan_pickle.pkl', 'wb') as f:
    pickle.dump(bag, f)