In [1]:
# importing necessary modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.feature_selection import SelectKBest, f_regression

In [2]:
# Load the listings file written by the EDA step and preview the first rows.
df = pd.read_csv(filepath_or_buffer="./data/listings_eda")

df.head(n=5)

Unnamed: 0,id,name,borough,neighbourhood,room_type_encoded,price,bathrooms,accommodates,bedrooms,beds
0,5136,"Spacious Brooklyn Duplex, Patio + Garden",Brooklyn,Sunset Park,0,275,1.0,4,2.0,2.0
1,5203,Cozy Clean Guest Room - Family Apt,Manhattan,Upper West Side,1,75,1.0,1,1.0,1.0
2,5121,BlissArtsSpace!,Brooklyn,Bedford-Stuyvesant,1,60,1.0,2,1.0,1.0
3,5178,Large Furnished Room Near B'way,Manhattan,Midtown,1,68,1.0,2,1.0,1.0
4,6872,Uptown Sanctuary w/ Private Bath (Month to Month),Manhattan,East Harlem,1,65,1.0,1,1.0,1.0


In [3]:
df.isna().sum()

id                   0
name                 0
borough              0
neighbourhood        0
room_type_encoded    0
price                0
bathrooms            0
accommodates         0
bedrooms             0
beds                 0
dtype: int64

In [4]:
# Expected column count after one-hot encoding with the first level dropped:
# (distinct neighbourhoods - 1) + (distinct boroughs - 1) + 11.
# NOTE(review): the constant 11 presumably stands for the pre-existing columns,
# but df.isna().sum() lists only 10 of them - confirm.
(df["neighbourhood"].nunique(dropna=False) - 1) + (df["borough"].nunique(dropna=False) - 1) + 11

236

In [5]:
# One-hot encode the location columns (first level of each dropped) and append
# the indicator columns to the original frame.
# NOTE: `df` is reassigned here, so re-running this cell without re-loading the
# data appends the same dummy columns a second time.
cols_to_encode = ['neighbourhood', 'borough']
dummies = pd.get_dummies(
    df[cols_to_encode],
    prefix=cols_to_encode,
    drop_first=True,
)
df = pd.concat((df, dummies), axis=1)


In [6]:
# Preview the frame now that the dummy columns have been appended.
df.head(n=5)

Unnamed: 0,id,name,borough,neighbourhood,room_type_encoded,price,bathrooms,accommodates,bedrooms,beds,...,neighbourhood_Willowbrook,neighbourhood_Windsor Terrace,neighbourhood_Woodhaven,neighbourhood_Woodlawn,neighbourhood_Woodrow,neighbourhood_Woodside,borough_Brooklyn,borough_Manhattan,borough_Queens,borough_Staten Island
0,5136,"Spacious Brooklyn Duplex, Patio + Garden",Brooklyn,Sunset Park,0,275,1.0,4,2.0,2.0,...,0,0,0,0,0,0,1,0,0,0
1,5203,Cozy Clean Guest Room - Family Apt,Manhattan,Upper West Side,1,75,1.0,1,1.0,1.0,...,0,0,0,0,0,0,0,1,0,0
2,5121,BlissArtsSpace!,Brooklyn,Bedford-Stuyvesant,1,60,1.0,2,1.0,1.0,...,0,0,0,0,0,0,1,0,0,0
3,5178,Large Furnished Room Near B'way,Manhattan,Midtown,1,68,1.0,2,1.0,1.0,...,0,0,0,0,0,0,0,1,0,0
4,6872,Uptown Sanctuary w/ Private Bath (Month to Month),Manhattan,East Harlem,1,65,1.0,1,1.0,1.0,...,0,0,0,0,0,0,0,1,0,0


In [7]:
# Modelling frame: drop the identifier/text columns and the raw categorical
# columns that the dummy columns now replace. `df` itself is left untouched.
df1 = df.drop(columns=['id', 'name', 'neighbourhood', 'borough'])

In [8]:
# Mean listing price over the full dataset (before splitting).
df1['price'].mean()

178.00104113831122

In [9]:
# Row counts to expect from a 70/30 train/test split.
df1.shape[0] * .7, df1.shape[0] * .3

(26221.3, 11237.699999999999)

In [10]:
# Hold out 30% of the rows for testing; `price` is the target and every other
# column of df1 is a feature. The fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df1.drop(columns='price'),
    df1['price'],
    test_size=0.3,
    random_state=18,
)

In [11]:
# Shapes of the train and test feature matrices, in that order.
tuple(split.shape for split in (X_train, X_test))

((26221, 230), (11238, 230))

In [12]:
# Column-dtype counts for the train and test feature matrices, in that order.
tuple(split.dtypes.value_counts() for split in (X_train, X_test))

(uint8      225
 float64      3
 int64        2
 dtype: int64,
 uint8      225
 float64      3
 int64        2
 dtype: int64)

I'll start off by using the mean as a predictor to establish a baseline.

In [13]:
# Baseline model: always predicts the mean of the training target.
dumb_reg = DummyRegressor(strategy='mean').fit(X_train, y_train)
# The constant the baseline will predict for every row.
dumb_reg.constant_

array([[177.9889783]])

In [14]:
# Baseline predictions for both splits, followed by their R^2 scores
# (train first, then test).
y_tr_pred, y_te_pred = dumb_reg.predict(X_train), dumb_reg.predict(X_test)
print(r2_score(y_train, y_tr_pred), r2_score(y_test, y_te_pred))

0.0 -5.927664825655654e-08


In [15]:
# Baseline mean squared error on each split. Only the test value feeds the
# RMSE printed below; mse_tr is not used in the cells visible here.
mse_tr = mean_squared_error(y_true=y_train, y_pred=y_tr_pred)
mse_te = mean_squared_error(y_true=y_test, y_pred=y_te_pred)

# Root of the test MSE, i.e. the error in the same units as price.
rmse = np.sqrt(mse_te)
print(rmse)

165.14858528894624


In [16]:
# Mean absolute error of the baseline on the test split, reported in dollars.
mae = mean_absolute_error(y_true=y_test, y_pred=y_te_pred)
print(
    "By using the average price of the listings as a predictor, we would be off by an average of ${} from the true values".format(round(mae, 2))
)

By using the average price of the listings as a predictor, we would be off by an average of $109.91 from the true values


Next, I'll fit a random forest with default settings to compare against the baseline.

In [17]:
# RandomForestRegressor is already imported in the notebook's first (imports)
# cell, so it is not re-imported here.

# Fit a random forest with default hyperparameters (seeded so the result is
# repeatable) and predict prices for the held-out test split.
rfr = RandomForestRegressor(random_state=18)
rfr.fit(X_train, y_train)
y_pred = rfr.predict(X_test)

In [18]:
# Test-set R^2 and RMSE for the random forest.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

print("R-squared:", r2)
print("RMSE:", np.sqrt(mse))

R-squared: 0.46376569900475006
RMSE: 120.93503657270503


In [19]:
# Mean absolute error of the random forest on the test split, in dollars.
# Note: this reassigns `mae`, replacing the baseline value computed earlier.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(
    "This random forest model's predictions would be off by an average of ${} from the true values".format(round(mae, 2))
)

This random forest model's predictions would be off by an average of $68.84 from the true values


In [20]:
# Show the hyperparameters the fitted forest is using (all defaults except
# random_state) as a reference point for tuning.
rfr.get_params(deep=True)

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 18,
 'verbose': 0,
 'warm_start': False}

In the next section I will tune the parameters of the RF model and cross-validate its performance.

In [21]:
# Persist the encoded frame for the next notebook, without the index column.
# NOTE(review): this writes `df`, which still contains id/name and the raw
# neighbourhood/borough columns, rather than the trimmed `df1` - confirm intended.
df.to_csv(path_or_buf='./data/modelling_data', index=False)