In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Read the CSV at data/zillow_201808.csv into the `zillow` DataFrame that the
# rest of the notebook works from.
zillow = pd.read_csv(filepath_or_buffer='data/zillow_201808.csv')

In [4]:
# Show the column labels of the loaded DataFrame; as the last expression in
# the cell, the resulting Index is echoed as the cell output.
zillow.columns

Index(['RegionID', 'RegionName', 'State', 'SizeRank',
       'price_to_rent_avg_201609_201708', 'price_to_rent_3yravg_2014_2017',
       'value_yoy_gr_2016_2018', 'rent_yoy_gr_2015_2017',
       'rent_3yravg_yoy_gr_2013_2017', 'sale_count_yoy_gr_2015_2017',
       'sale_count_3yravg_yoy_gr_2013_2017', 'turnover_yoy_gr_2015_2017',
       'turnover_3yravg_yoy_gr_2013_2017', 'pct_reduced_yoy_gr_2015_2017',
       'pct_reduced_3yravg_yoy_gr_2013_2017', 'sale_price_yoy_gr_2015_2017',
       'sale_price_3yravg_yoy_gr_2013_2017', 'ptrg_1yr_201609_201708',
       'ptrg_3yr_2013_2017'],
      dtype='object')

In [5]:
# Feature columns: every column except the response and the four
# identifier/rank columns listed here. Column order follows the DataFrame.
feature_cols = zillow.drop(
    columns=['value_yoy_gr_2016_2018', 'RegionID', 'RegionName', 'State', 'SizeRank']
).columns

# Feature matrix restricted to the selected columns.
X = zillow.loc[:, feature_cols]

# Response vector: the value_yoy_gr_2016_2018 column.
y = zillow['value_yoy_gr_2016_2018']

In [6]:
from sklearn.ensemble import RandomForestRegressor

In [7]:
# Random forest with 150 trees and 5 candidate features per split; the
# original author's note states that max_features=5 is best and that
# n_estimators=150 is sufficiently large.
# oob_score=True makes the out-of-bag score available after fitting, and the
# fixed random_state keeps the fit repeatable.
rfreg = RandomForestRegressor(
    random_state=99,
    oob_score=True,
    max_features=5,
    n_estimators=150,
)

# Fit on the full feature matrix and response. As the last expression in the
# cell, the fitted estimator is echoed as the cell output.
rfreg.fit(X, y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=5, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=150, n_jobs=1, oob_score=True, random_state=99,
           verbose=0, warm_start=False)

In [8]:
# Tabulate each feature name next to the importance reported by the fitted
# forest, ordered from least to most important.
(
    pd.DataFrame(
        {
            'feature': feature_cols,
            'importance': rfreg.feature_importances_,
        }
    )
    .sort_values(by='importance')
)

Unnamed: 0,feature,importance
5,sale_count_3yravg_yoy_gr_2013_2017,0.028345
4,sale_count_yoy_gr_2015_2017,0.028889
6,turnover_yoy_gr_2015_2017,0.031503
7,turnover_3yravg_yoy_gr_2013_2017,0.034924
13,ptrg_3yr_2013_2017,0.036649
12,ptrg_1yr_201609_201708,0.046573
8,pct_reduced_yoy_gr_2015_2017,0.047227
3,rent_3yravg_yoy_gr_2013_2017,0.066735
2,rent_yoy_gr_2015_2017,0.072527
0,price_to_rent_avg_201609_201708,0.081919


In [9]:
from sklearn import metrics
from sklearn.model_selection import cross_val_score

# Out-of-bag R-squared of the forest fitted above.
print(rfreg.oob_score_)

# 10-fold cross-validation scored with negated MSE: flip the sign, take the
# square root of each fold's MSE, then average the per-fold RMSE values.
scores = cross_val_score(
    estimator=rfreg,
    X=X,
    y=y,
    scoring='neg_mean_squared_error',
    cv=10,
)
np.sqrt(-scores).mean()

0.5134756056312553


0.030213010316276358

In [11]:
# Summary report. The figures are read from the fitted model, the
# cross-validation scores and the response vector instead of being pasted in
# as string literals, so they cannot go stale when the data or model changes.
oob_r2 = rfreg.oob_score_

# Mean of the per-fold RMSE values (scores hold negated MSE per fold).
avg_rmse = np.mean(np.sqrt(-scores))

# Null model: predict the mean of y for every row and measure the RMSE.
# NOTE(review): the computation behind the previously pasted null RMSE is not
# shown in this notebook; this follows the printed label ("always predicting
# the mean response value") -- confirm it matches the intended baseline.
null_rmse = np.sqrt(np.mean((y - y.mean()) ** 2))

print('Out-of-bag R-squared score')
print(oob_r2)
print('Average RMSE')
print(avg_rmse)
print('--------------------')
print('Null RMSE - Always predicting the mean response value')
print(null_rmse)

Out-of-bag R-squared score
0.5134756056312553
Average RMSE
0.030213010316276358
--------------------
Null RMSE - Always predicting the mean response value
0.042235267683440235
