In [1]:
import os
import pickle

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, GroupShuffleSplit
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, ExtraTreesRegressor
from sklearn import metrics

In [2]:
# Load the full dataset from the HDF5 file and preview its first rows.
df_full = pd.read_hdf('../data/df_full.h5')
df_full.head()

Unnamed: 0,id,date,available,price,minimum_nights,maximum_nights,host_id,host_response_time,host_is_superhost,host_listings_count,...,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,calculated_host_listings_count,review,month
0,2595,2022-06-04,0,225.0,30,1125,2845.0,a few days or more,0.0,6.0,...,4.72,4.62,4.76,4.79,4.86,4.41,0.0,3.0,0 11/21/2009\n1 5/28/2009\n2 ...,Jun
1,2595,2022-06-05,0,225.0,30,1125,2845.0,a few days or more,0.0,6.0,...,4.72,4.62,4.76,4.79,4.86,4.41,0.0,3.0,0 11/21/2009\n1 5/28/2009\n2 ...,Jun
2,2595,2022-06-06,0,225.0,30,1125,2845.0,a few days or more,0.0,6.0,...,4.72,4.62,4.76,4.79,4.86,4.41,0.0,3.0,0 11/21/2009\n1 5/28/2009\n2 ...,Jun
3,2595,2022-06-07,0,225.0,30,1125,2845.0,a few days or more,0.0,6.0,...,4.72,4.62,4.76,4.79,4.86,4.41,0.0,3.0,0 11/21/2009\n1 5/28/2009\n2 ...,Jun
4,2595,2022-06-08,0,225.0,30,1125,2845.0,a few days or more,0.0,6.0,...,4.72,4.62,4.76,4.79,4.86,4.41,0.0,3.0,0 11/21/2009\n1 5/28/2009\n2 ...,Jun


In [3]:
# (rows, columns) of the full frame before any borough filtering.
df_full.shape

(1036165, 36)

In [4]:
# Keep only the Staten Island rows. Filtering first and copying the (much
# smaller) result avoids duplicating the whole of df_full, while still giving
# an independent frame that can be modified without touching df_full.
is_staten_island = df_full['neighbourhood_group_cleansed'] == 'Staten Island'
Staten_Island = df_full[is_staten_island].copy()
Staten_Island.shape

(5840, 36)

We will first make a baseline:

In [5]:
# Naive baseline: predict the borough-wide median price for every row.
# The constant series is built directly instead of mapping a lambda over each
# element; index, name and dtype match the price column.
prices = Staten_Island['price']
m = prices.median()
print(m)
pred = pd.Series(m, index=prices.index, name=prices.name)
pred.head()

75.0


29565    75.0
29566    75.0
29567    75.0
29568    75.0
29569    75.0
Name: price, dtype: float64

In [6]:
# Mean squared error of the constant-median baseline, measured on every
# Staten Island row (no train/test split is involved here).
metrics.mean_squared_error(Staten_Island['price'], pred)

14305.233561643836

In [7]:
# Root mean squared error of the baseline; the square root puts the error back
# in the units of price.
np.sqrt(metrics.mean_squared_error(Staten_Island['price'], pred))

119.60448804975437

In [8]:
# Mean absolute error of the baseline: average size of (actual - predicted).
residuals = Staten_Island['price'].sub(pred)
residuals.abs().mean()

55.72054794520548

Next let's export the data

In [9]:
# Write the Staten Island subset to its own HDF5 file under the key 'stage'.
# NOTE(review): the original note here said "run twice". mode='w' recreates the
# file on each call, so one run should be enough — TODO confirm and drop the
# need to re-run, otherwise Restart & Run All is not reproducible.
Staten_Island.to_hdf(r'../data/Staten_Island.h5', key='stage', mode='w')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block3_values] [items->Index(['host_response_time', 'host_verifications', 'neighbourhood_cleansed',
       'neighbourhood_group_cleansed', 'property_type', 'room_type',
       'amenities', 'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value', 'review', 'month'],
      dtype='object')]

  pytables.to_hdf(


Make dummies

In [10]:
# One-hot encode the two categorical columns that the models use downstream;
# every other column is carried over unchanged.
categorical_cols = ['host_response_time', 'room_type']
Staten_Island_dummies = pd.get_dummies(data=Staten_Island, columns=categorical_cols)

In [11]:
# Correlation of every numeric/boolean column with price, ascending.
# The frame still holds object columns (see the to_hdf warning above), and
# DataFrame.corr() on such a frame raises in pandas >= 2.0, so the numeric and
# boolean columns are selected explicitly before correlating.
numeric_cols = Staten_Island_dummies.select_dtypes(include=['number', 'bool'])
numeric_cols.corr()['price'].sort_values()

latitude                                -0.689581
room_type_Private room                  -0.529195
longitude                               -0.515542
host_response_time_unknown              -0.239379
instant_bookable                        -0.142014
maximum_nights                          -0.139553
host_response_time_within a day         -0.123903
minimum_nights                          -0.114987
number_of_reviews                       -0.053676
calculated_host_listings_count          -0.047242
id                                      -0.028807
host_id                                  0.018747
host_is_superhost                        0.022879
host_response_time_within a few hours    0.064620
host_listings_count                      0.099251
host_has_profile_pic                     0.102953
available                                0.215412
host_response_time_within an hour        0.216848
has_availability                         0.264027
availability_365                         0.286863


We see much stronger correlations with price among the Staten Island variables than among the Manhattan ones.

In [12]:
def tts_borough(df, xvars, dge, target='price', random_state=42, groups=None):
    """Split a borough frame into train/test sets and add polynomial features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns and the target column.
    xvars : list of str
        Names of the feature columns to use.
    dge : int
        Degree passed to ``PolynomialFeatures`` (no bias column is added).
    target : str, default 'price'
        Name of the column to predict.
    random_state : int, default 42
        Seed for the train/test split.
    groups : str or None, default None
        Optional name of a column in ``df`` (for example ``'id'``). When given,
        all rows sharing a value in that column are kept on the same side of
        the split. NOTE(review): the data looks like one row per listing per
        calendar date, so a plain random row split would put the same listing
        in both train and test — confirm, and consider passing ``groups='id'``.

    Returns
    -------
    X_train, X_test
        Polynomial-expanded feature matrices for the two splits.
    y_train, y_test : pandas.Series
        Target values for the two splits.
    """
    X = df[xvars]
    y = df[target]

    if groups is None:
        # Same rows as splitting the already-expanded matrix: the split depends
        # only on the number of rows and the seed.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, random_state=random_state
        )
    else:
        # test_size=0.25 mirrors train_test_split's default, but here it is a
        # share of groups rather than of rows.
        splitter = GroupShuffleSplit(
            n_splits=1, test_size=0.25, random_state=random_state
        )
        train_idx, test_idx = next(splitter.split(X, y, groups=df[groups]))
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Fit the transformer on the training rows only, then reuse it for the test
    # rows, so nothing about the test split is seen during fitting.
    poly = PolynomialFeatures(include_bias=False, degree=dge)
    X_train = poly.fit_transform(X_train)
    X_test = poly.transform(X_test)
    return X_train, X_test, y_train, y_test

In [13]:
# Features for the linear model, expanded to degree-2 polynomial terms.
# NOTE(review): two of these are 0/1 dummies of the same category. Squaring a
# dummy reproduces it and the product of the two is always zero, so the
# expanded matrix contains duplicate and all-zero columns; the very large
# intercept and coefficients printed further down are consistent with that.
xvars = [
    'beds',
    'accommodates',
    'latitude',
    'room_type_Entire home/apt',
    'room_type_Private room',
    'longitude',
]
X_train, X_test, y_train, y_test = tts_borough(df=Staten_Island_dummies, xvars=xvars, dge=2)

Let's make a linear regression model


In [14]:
# Linear regression with default settings; not fitted yet.
lr = LinearRegression()

In [15]:
# Mean of the five cross-validation fold scores, computed on the training split only.
cross_val_score(lr, X_train, y_train, cv = 5).mean()

0.9730873678018096

The mean 5-fold cross-validation score on the training data is very good (about 0.97).


In [16]:
# Fit on the training split (the polynomial features returned by tts_borough).
lr.fit(X_train, y_train)

LinearRegression()

In [17]:
# Linear-model predictions for the held-out test split; reused by the metric cells below.
prediction = lr.predict(X_test)

The fitted model's intercept and coefficients are:

In [18]:
# Show the fitted parameters: one intercept plus one coefficient per polynomial term.
intercept, coefficients = lr.intercept_, lr.coef_
print(f'Model intercept: {intercept}')
print(f'Model coefficient values: {coefficients}')

Model intercept: 39100712601091.34
Model coefficient values: [-1.33634782e+11 -1.20139656e+11  2.49563574e+10  3.64554119e+10
 -3.59565453e+10 -4.63510027e+09  7.86418896e+07 -1.38274257e+08
  4.50920104e+09  5.11453354e+10  5.10366747e+10  1.35661822e+09
  7.76550408e+07  4.75339035e+08 -4.36894494e+10 -4.36535580e+10
 -1.94695026e+09 -5.54421477e+09 -8.93706749e+11 -8.90508553e+11
 -1.76907124e+10  3.93241117e+10  0.00000000e+00  5.78818320e+11
 -3.94564617e+10  5.78531039e+11 -9.90881436e+08]


The R^2 value is


In [19]:
# R² on the split the model was fit on versus the held-out split.
train_r2 = lr.score(X_train, y_train)
test_r2 = lr.score(X_test, y_test)
print(f'Training R2: {train_r2}')
print(f'Testing R2: {test_r2}')

Training R2: 0.9733313324844002
Testing R2: 0.9744491143128797


The mean squared error is


In [20]:
# Mean squared error of the linear model on the test split.
metrics.mean_squared_error(y_test, prediction)

337.1530138825717

And the root mean squared error is


In [21]:
# Root mean squared error of the linear model on the test split (in units of price).
np.sqrt(metrics.mean_squared_error(y_test, prediction))

18.361726876374448

How about the mean absolute error (the mean of the absolute residuals)?


In [22]:
# Mean absolute error of the linear model on the test split.
residuals = y_test.sub(prediction)
residuals.abs().mean()

7.564854452054795

The linear regression model is really good. But let's see if we can do even better! Next we will try a decision tree.

In [23]:
# The decision tree is fit on three raw columns, with no polynomial expansion.
xvars = ['beds', 'accommodates', 'latitude']
X = Staten_Island_dummies.loc[:, xvars]
y = Staten_Island_dummies.loc[:, 'price']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [24]:
# I tweaked this until I got the best result.
# NOTE(review): only random_state is set here; every other hyperparameter is
# left at its default.
dt = DecisionTreeRegressor(random_state=42)

In [25]:
# Fit the tree on the training split of the three raw features.
dt.fit(X_train, y_train)

DecisionTreeRegressor(random_state=42)

In [26]:
# Decision-tree predictions for the held-out test split; reused by the metric cells below.
prediction = dt.predict(X_test)

In [27]:
# Decision-tree score on the fitted split versus the held-out split.
train_score = dt.score(X_train, y_train)
test_score = dt.score(X_test, y_test)
print(f'Score on training set: {train_score}')
print(f'Score on testing set: {test_score}')

Score on training set: 0.9733315022216071
Score on testing set: 0.9744500709956425


In [28]:
# Mean squared error of the decision tree on the test split.
metrics.mean_squared_error(y_test, prediction)

337.1403901136449

In [29]:
# Root mean squared error of the decision tree on the test split (in units of price).
np.sqrt(metrics.mean_squared_error(y_test, prediction))

18.361383120931954

In [30]:
# Mean absolute error of the decision tree on the test split.
residuals = y_test.sub(prediction)
residuals.abs().mean()

7.5585589891145

Not much of a difference. Let's try the bagging model.

In [31]:
# Rebuild the three-feature split for the bagging model (same columns and seed
# as the decision-tree split, so the rows are identical).
xvars = ['beds', 'accommodates', 'latitude']
X = Staten_Island_dummies.loc[:, xvars]
y = Staten_Island_dummies.loc[:, 'price']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [32]:
# Bagging ensemble with a fixed seed, fitted on the training split
# (approach taken from lesson 6.02).
bag = BaggingRegressor(random_state=42).fit(X_train, y_train)
bag

BaggingRegressor(random_state=42)

In [33]:
# Bagging-model predictions for the held-out test split; reused by the metric cells below.
prediction = bag.predict(X_test)

In [34]:
# Bagging-model score on the fitted split versus the held-out split.
train_score = bag.score(X_train, y_train)
test_score = bag.score(X_test, y_test)
print(f'Score on training set: {train_score}')
print(f'Score on testing set: {test_score}')

Score on training set: 0.9733123505922943
Score on testing set: 0.9744209033250809


In [35]:
# Mean squared error of the bagging model on the test split.
metrics.mean_squared_error(y_test, prediction)

337.52526788885

In [36]:
# Root mean squared error of the bagging model on the test split (in units of price).
np.sqrt(metrics.mean_squared_error(y_test, prediction))

18.37186076283102

In [37]:
# Mean absolute error of the bagging model on the test split.
residuals = y_test.sub(prediction)
residuals.abs().mean()

7.636950988061424

___

RandomForest:

In [38]:
# Rebuild the three-feature split for the random forest (same columns and seed
# as the earlier tree-model splits, so the rows are identical).
xvars = ['beds', 'accommodates', 'latitude']
X = Staten_Island_dummies.loc[:, xvars]
y = Staten_Island_dummies.loc[:, 'price']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [39]:
# Random forest of 100 trees. The seed is fixed (42, like the other models in
# this notebook) so that a fresh Restart & Run All reproduces the same scores.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

In [40]:
# Fit the forest on the training split of the three raw features.
rf.fit(X_train,y_train)

RandomForestRegressor()

In [41]:
# Random-forest predictions for the held-out test split; reused by the metric cells below.
prediction = rf.predict(X_test)

In [42]:
# Random-forest score on the fitted split versus the held-out split.
train_score = rf.score(X_train, y_train)
test_score = rf.score(X_test, y_test)
print(f'Score on training set: {train_score}')
print(f'Score on testing set: {test_score}')

Score on training set: 0.9733307987557265
Score on testing set: 0.9744517595830207


In [43]:
# Mean squared error of the random forest on the test split.
metrics.mean_squared_error(y_test, prediction)

337.11810860329985

In [44]:
# Root mean squared error of the random forest on the test split (in units of price).
np.sqrt(metrics.mean_squared_error(y_test, prediction))

18.36077636167109

In [45]:
# Mean absolute error of the random forest on the test split.
residuals = y_test.sub(prediction)
residuals.abs().mean()

7.562081334842878

ExtraTrees:

In [46]:
# Rebuild the three-feature split for the extra-trees model (same columns and
# seed as the earlier tree-model splits, so the rows are identical).
xvars = ['beds', 'accommodates', 'latitude']
X = Staten_Island_dummies.loc[:, xvars]
y = Staten_Island_dummies.loc[:, 'price']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [47]:
# Extra-trees ensemble of 100 trees. The seed is fixed (42, like the other
# models in this notebook) so that repeated runs build the same trees.
et = ExtraTreesRegressor(n_estimators=100, random_state=42)

In [48]:
# Fit the extra-trees ensemble on the training split of the three raw features.
et.fit(X_train, y_train)

ExtraTreesRegressor()

In [49]:
# Extra-trees predictions for the held-out test split; reused by the metric cells below.
prediction = et.predict(X_test)

In [50]:
# Extra-trees score on the fitted split versus the held-out split.
train_score = et.score(X_train, y_train)
test_score = et.score(X_test, y_test)
print(f'Score on training set: {train_score}')
print(f'Score on testing set: {test_score}')

Score on training set: 0.9733315022216071
Score on testing set: 0.9744500709956425


In [51]:
# Mean squared error of the extra-trees model on the test split.
metrics.mean_squared_error(y_test, prediction)

337.14039011364497

In [52]:
# Root mean squared error of the extra-trees model on the test split (in units of price).
np.sqrt(metrics.mean_squared_error(y_test, prediction))

18.361383120931958

In [53]:
# Mean absolute error of the extra-trees model on the test split.
residuals = y_test.sub(prediction)
residuals.abs().mean()

7.558558989114516

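Decision Tree and Extra Trees have the lowest mean absolute error here. Random Forest is marginally ahead on MSE and R², but the differences between all of the models are negligible. So I'll go with the decision tree for the Streamlit app.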

We will pickle this object to use for our streamlit app


In [54]:
# Persist the fitted decision tree for the Streamlit app.
# The tree was fit on the columns ['beds', 'accommodates', 'latitude'], in that
# order, so the app has to supply its inputs in the same order.
model_path = 'pickled/staten_island_pickle.pkl'
# open() raises FileNotFoundError if the target folder is missing, so create it first.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
with open(model_path, 'wb') as f:
    pickle.dump(dt, f)