In [92]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import sys
from patsy import dmatrices
from sklearn.model_selection import train_test_split
from mizani.formatters import percent_format
from plotnine import *
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from skranger.ensemble import RangerForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.impute import SimpleImputer
from sklearn.inspection import permutation_importance
from sklearn.inspection import plot_partial_dependence
from sklearn.inspection import partial_dependence
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import GradientBoostingRegressor
from py_helper_functions import *

ModuleNotFoundError: No module named 'py_helper_functions'

In [52]:
data=pd.read_csv('listings_mad_cleaned.csv')

In [53]:
to_filter=data.isna().sum()
to_filter[to_filter>0].index

Index([], dtype='object')

In [54]:
def count_missing_values(df):
    return df.isna().sum()[df.isna().sum()>0]

In [55]:
count_missing_values(data)

Series([], dtype: int64)

In [56]:
# Sample definition and preparation ---------------------------------------

# We focus on 2-6 person places
data = data.query('n_accommodates < 7 & n_accommodates>1')

In [57]:
# copy a variable - purpose later, see at variable importance
data = data.assign(n_accommodates_copy = data.n_accommodates)

In [58]:
data.describe()

Unnamed: 0.1,Unnamed: 0,n_days_since,n_accommodates,n_beds,n_minimum_nights,n_number_of_reviews,n_review_scores_rating,n_reviews_per_month,id,price2,...,flag_review_scores_rating,flag_reviews_per_month,flag_n_number_of_reviews,ln_days_since,ln_days_since2,ln_days_since3,n_days_since2,n_days_since3,ln_review_scores_rating,n_accommodates_copy
count,6504.0,6504.0,6504.0,6504.0,6504.0,6504.0,6504.0,6504.0,6504.0,6504.0,...,6504.0,6504.0,6504.0,6504.0,6504.0,6504.0,6504.0,6504.0,6504.0,6504.0
mean,7625.597632,1175.422509,3.057349,1.790602,6.120541,57.70449,4.621645,1.519717,27554900.0,62.246771,...,0.0,0.0,0.0,6.659623,45.746862,320.509738,1983628.0,3974277000.0,-inf,3.057349
std,4972.173949,775.952686,1.197645,0.907181,33.447974,84.563481,0.467449,1.648654,14966900.0,23.019882,...,0.0,0.0,0.0,1.181734,13.554089,124.118718,2246171.0,6559185000.0,,1.197645
min,0.0,1.0,2.0,1.0,1.0,1.0,0.0,0.01,6369.0,9.0,...,0.0,0.0,0.0,0.693147,0.480453,0.333025,1.0,1.0,-inf,2.0
25%,3311.75,651.0,2.0,1.0,1.0,5.0,4.5,0.3,16224600.0,45.0,...,0.0,0.0,0.0,6.480045,41.990978,272.103406,423801.0,275894500.0,1.504077,2.0
50%,6988.5,1107.0,3.0,2.0,2.0,22.0,4.73,0.98,27192370.0,63.0,...,0.0,0.0,0.0,7.010312,49.144472,344.518079,1225449.0,1356572000.0,1.553925,3.0
75%,11856.25,1665.0,4.0,2.0,3.0,75.0,4.9,2.2,40191260.0,80.0,...,0.0,0.0,0.0,7.418181,55.029407,408.21809,2772225.0,4615755000.0,1.589235,4.0
max,17554.0,3910.0,6.0,8.0,1125.0,758.0,5.0,24.59,53139100.0,105.0,...,0.0,0.0,0.0,8.271548,68.418513,565.927036,15288100.0,59776470000.0,1.609438,6.0


In [9]:
data_train, data_holdout = train_test_split(data,train_size=0.7, random_state=42)

In [59]:
data.drop(columns=['neighbourhood_cleansed', 'room_type', 'property_type', 'f_property_type',
                  'f_room_type2', 'f_neighbourhood_cleansed', 'f_neighbourhood_group_cleansed',
                 'id','price'], axis=1, inplace=True)

In [60]:
data['f_room_type'] = (data['f_room_type'] == 'Entire home/apt').astype(int)

In [10]:
data_train.shape, data_holdout.shape

((4552, 37), (1952, 37))

In [61]:
# Basic Variables inc neighnourhood
basic_vars = [
    "n_accommodates",
    "n_beds",
    "n_days_since",
    "f_property_type",
    "f_room_type",
    "f_neighbourhood_cleansed",
]

# reviews
reviews = [
    "n_number_of_reviews",
    "flag_n_number_of_reviews",
    "n_review_scores_rating",
    "flag_review_scores_rating",
]

# Dummy variables
amenities = [col for col in data if col.startswith("d_")]

# interactions for the LASSO
# from ch14
X1 = [
    "n_accommodates:f_property_type",
    "f_room_type:f_property_type",


]
# with boroughs
X2 = [
    "f_property_type:f_neighbourhood_cleansed",
    "f_room_type:f_neighbourhood_cleansed",
    "n_accommodates:f_neighbourhood_cleansed",
]

In [18]:
predictors_1 = basic_vars
predictors_2 = basic_vars + reviews + amenities
predictors_E = basic_vars + reviews + amenities + X1 + X2

In [20]:
#rfr = RangerForestRegressor(importance="impurity",seed = 42)

#tune_grid = {"mtry": [5, 7, 9], "min_node_size": [5, 10]}

#rf_random = GridSearchCV(
#    rfr,
#    tune_grid,
#    cv=5,
#    scoring="neg_root_mean_squared_error",
#    verbose=3,
#)

#y, X = dmatrices("price ~ " + " + ".join(predictors_1), data_train)

#rf_model_1 = rf_random.fit(X, y.ravel())

In [62]:
X = data.drop(['price2','ln_review_scores_rating'], axis=1)  # i dropped ln_review_scores_rating as well because there is a inf and NaN that I couldnt figure out how it got there
y = data['price2']  # target
# data standard
ss = StandardScaler()
X_std = ss.fit_transform(X)

# train test split
X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size=0.2, random_state=42)

In [65]:
ols_model = LinearRegression().fit(X_train, y_train)
y_hat = ols_model.predict(X_test)
ols_rmse = mean_squared_error(y_test, y_hat, squared=False)

In [66]:
ols_model_coeffs_df = pd.DataFrame(
    ols_model.coef_.tolist(),
    index=X.columns,
    columns=['ols_coefficient'],
).assign(ols_coefficient=lambda x: x.ols_coefficient.round(3))
ols_model_coeffs_df.sort_values('ols_coefficient', inplace=True, ascending=False)

In [74]:
def coef_matrix(X, model):

    coef_matrix = pd.concat([pd.DataFrame(X.columns),pd.DataFrame(np.transpose(model.coef_))], axis = 1)
    coef_matrix.columns = ['variable', 'coefficient']
    coef_matrix = coef_matrix.append({'variable': 'Intercept', 'coefficient': np.asscalar(model.intercept_)}, ignore_index=True)
    return(coef_matrix)

In [75]:
coef_matrix(X,ols_model)



Unnamed: 0,variable,coefficient
0,Unnamed: 0,-0.205593
1,f_room_type,-1.00222
2,n_days_since,0.7052043
3,n_accommodates,-28.17897
4,n_beds,0.7681959
5,n_minimum_nights,0.01294324
6,n_number_of_reviews,-0.0001766503
7,n_review_scores_rating,0.08018273
8,n_reviews_per_month,-0.1137888
9,n_accommodates2,12.12819


In [78]:
# cross validation
ols_model = LinearRegression()
ols_cv_mse = cross_val_score(ols_model, X_std, y, cv=5, scoring='neg_mean_squared_error')
ols_cv_rmse = [np.sqrt(-x) for x in ols_cv_mse]

In [79]:
ols_cv_rmse

[6.411472940403095,
 5.389003294936061,
 5.65552699093339,
 5.205390363655844,
 5.408908745311558]

In [80]:
lasso_model = Lasso(alpha=0.5).fit(X_train, y_train)
y_hat = lasso_model.predict(X_test)
lasso_rmse = mean_squared_error(y_test, y_hat, squared=False)

In [81]:
# cross validation
lasso_model = Lasso(alpha=0.5)
lasso_cv_mse = cross_val_score(lasso_model, X_std, y, cv=5, scoring='neg_mean_squared_error')
lasso_cv_rmse = [np.sqrt(-x) for x in lasso_cv_mse]

In [82]:
lasso_cv_rmse

[5.592685336457046,
 5.4845522243593825,
 5.710252707850093,
 5.304981450220126,
 5.466187005493391]

In [83]:
cart_model = DecisionTreeRegressor().fit(X_train, y_train)
y_hat = cart_model.predict(X_test)
cart_rmse = mean_squared_error(y_test, y_hat, squared=False)

NameError: name 'DecisionTreeRegressor' is not defined

In [85]:
# cross validation
cart_model = DecisionTreeRegressor()
cart_cv_mse = cross_val_score(cart_model, X_std, y, cv=5, scoring='neg_mean_squared_error')
cart_cv_rmse = [np.sqrt(-x) for x in cart_cv_mse]

In [86]:
cart_cv_rmse

[0.04801998046867725, 0.03920814986889116, 0.02772434865007138, 0.0, 0.0]

In [89]:
gbm_model = GradientBoostingRegressor().fit(X_train, y_train)
y_hat = gbm_model.predict(X_test)
gbm_rmse = mean_squared_error(y_test, y_hat, squared=False)

In [90]:
# cross validation
gbm_model = GradientBoostingRegressor()
gbm_cv_mse = cross_val_score(gbm_model, X_std, y, cv=5, scoring='neg_mean_squared_error')
gbm_cv_rmse = [np.sqrt(-x) for x in gbm_cv_mse]

In [91]:
gbm_cv_rmse

[0.021834533221382242,
 0.027382827870295345,
 0.023115591935100627,
 0.024378670377395974,
 0.026266897281823964]