# Packages

In [1]:
import pandas as pd
pd.set_option('display.max_columns', 68)
pd.options.display.float_format = '{:,.2f}'.format
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.metrics import classification_report
import joblib
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Data Import and EDA

In [2]:
df = pd.read_csv('../Resources/final_data2.csv')
df.head()

Unnamed: 0,county_name,year,num_of_bedrooms,house_value_index,est_pop,median_hh_income,poverty_count,poverty_rate,st_abb,state_code,county_code,tax_rate,apr_30,points_30,apr_15,points_15
0,ATLANTIC,2010,1,120414.14,274654,51457,36693,13.6,NJ,34,1,2.67,4.69,0.72,4.1,0.67
1,ATLANTIC,2011,1,106680.39,274635,49983,35108,13.1,NJ,34,1,2.65,4.45,0.72,3.68,0.71
2,ATLANTIC,2012,1,100139.16,274657,50881,38245,14.2,NJ,34,1,2.73,3.66,0.72,2.93,0.68
3,ATLANTIC,2013,1,94991.76,274360,51668,46281,17.1,NJ,34,1,2.87,3.98,0.73,3.1,0.7
4,ATLANTIC,2014,1,92839.52,272634,54208,40761,15.1,NJ,34,1,2.84,4.17,0.59,3.29,0.57


In [3]:
df.shape

(1260, 16)

In [4]:
df.describe()

Unnamed: 0,year,num_of_bedrooms,house_value_index,est_pop,median_hh_income,poverty_count,poverty_rate,state_code,county_code,tax_rate,apr_30,points_30,apr_15,points_15
count,1260.0,1260.0,1260.0,1260.0,1260.0,1260.0,1260.0,1260.0,1260.0,1260.0,1260.0,1260.0,1260.0,1260.0
mean,2015.5,3.0,301526.97,423663.9,76414.14,42851.11,10.05,34.0,21.0,3.38,3.92,0.63,3.22,0.6
std,3.45,1.41,174631.35,253661.54,18476.21,32803.23,4.06,0.0,12.12,1.65,0.5,0.09,0.51,0.09
min,2010.0,1.0,47502.88,62341.0,45438.0,4793.0,3.9,34.0,1.0,0.97,2.96,0.48,2.27,0.45
25%,2012.75,2.0,171183.5,150928.75,61363.25,16560.5,6.67,34.0,11.0,2.47,3.66,0.53,2.93,0.49
50%,2015.5,3.0,263753.4,448449.0,74839.0,36293.5,9.45,34.0,21.0,2.97,3.96,0.65,3.19,0.61
75%,2018.25,4.0,388531.82,603111.25,87602.25,63450.25,13.1,34.0,31.0,3.69,4.24,0.72,3.46,0.69
max,2021.0,5.0,1312995.76,953819.0,123708.0,136161.0,20.1,34.0,41.0,9.66,4.69,0.73,4.1,0.71


In [5]:
# label encoding of counties
le = LabelEncoder()
df['county_label'] = le.fit_transform(df['county_name'])

In [6]:
df.head()

Unnamed: 0,county_name,year,num_of_bedrooms,house_value_index,est_pop,median_hh_income,poverty_count,poverty_rate,st_abb,state_code,county_code,tax_rate,apr_30,points_30,apr_15,points_15,county_label
0,ATLANTIC,2010,1,120414.14,274654,51457,36693,13.6,NJ,34,1,2.67,4.69,0.72,4.1,0.67,0
1,ATLANTIC,2011,1,106680.39,274635,49983,35108,13.1,NJ,34,1,2.65,4.45,0.72,3.68,0.71,0
2,ATLANTIC,2012,1,100139.16,274657,50881,38245,14.2,NJ,34,1,2.73,3.66,0.72,2.93,0.68,0
3,ATLANTIC,2013,1,94991.76,274360,51668,46281,17.1,NJ,34,1,2.87,3.98,0.73,3.1,0.7,0
4,ATLANTIC,2014,1,92839.52,272634,54208,40761,15.1,NJ,34,1,2.84,4.17,0.59,3.29,0.57,0


In [7]:
ref_df = df[['county_name','county_label']].drop_duplicates().reset_index(drop=True)
ref_df

Unnamed: 0,county_name,county_label
0,ATLANTIC,0
1,BERGEN,1
2,BURLINGTON,2
3,CAMDEN,3
4,CAPE MAY,4
5,CUMBERLAND,5
6,ESSEX,6
7,GLOUCESTER,7
8,HUDSON,8
9,HUNTERDON,9


# Train test split

In [47]:
# feature selection
X = df.drop(["house_value_index", 'county_name', 'st_abb','state_code','county_code','apr_30','points_30','apr_15','points_15'], axis=1)
y = df["house_value_index"]
print(X.shape, y.shape)

(1260, 8) (1260,)


In [48]:
# Split data into training and testing groups and scale data 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Model fit

In [49]:
# Fit the data into model
rfm = RandomForestRegressor(n_estimators=200)
rfm.fit(X_train, y_train)

In [50]:
sorted(zip(rfm.feature_importances_, X.columns), reverse=True)

[(0.5306595114263427, 'num_of_bedrooms'),
 (0.1396069078584677, 'tax_rate'),
 (0.10944174975601374, 'est_pop'),
 (0.07608939712704978, 'median_hh_income'),
 (0.04986793563321422, 'year'),
 (0.03541293721712579, 'county_label'),
 (0.03336319510425408, 'poverty_count'),
 (0.025558365877532137, 'poverty_rate')]

In [51]:
predictions = rfm.predict(X_test)
print(f"Training Data Score: {rfm.score(X_train, y_train)}")
print(f"Testing Data Score: {rfm.score(X_test, y_test)}")
print(f"Production Data Score: {rfm.score(X, y)}")

Training Data Score: 0.9937723196024478
Testing Data Score: 0.9707401005841938
Production Data Score: 0.9885677436372031


In [52]:
from sklearn.metrics import mean_squared_error as MSE

In [77]:
MSE_score = MSE(y_test,predictions)
print("Mean Squared Error: ",MSE_score.mean())
print("Root Mean Squared Error: ",np.sqrt(MSE_score.mean()))

Mean Squared Error:  806404319.2723241
Root Mean Squared Error:  28397.25900984678


# Hyperparameter Tuning

In [19]:
# Get randomforest params
rfm.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 200,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [25]:
# Create the GridSearchCV model
param_grid = {
    'n_estimators': [200, 250, 150],
    'max_features': [1, 'sqrt', 'log2'],
    'criterion': ['squared_error', 'absolute_error'],
    'max_depth': [None, 5, 10]
}
grid = GridSearchCV(rfm, param_grid, cv=5, verbose=3)

In [26]:
# Train the model with GridSearch
grid.fit(X_train, y_train)

Fitting 5 folds for each of 54 candidates, totalling 270 fits
[CV 1/5] END criterion=squared_error, max_depth=None, max_features=1, n_estimators=200;, score=0.695 total time=   0.3s
[CV 2/5] END criterion=squared_error, max_depth=None, max_features=1, n_estimators=200;, score=0.707 total time=   0.3s
[CV 3/5] END criterion=squared_error, max_depth=None, max_features=1, n_estimators=200;, score=0.723 total time=   0.4s
[CV 4/5] END criterion=squared_error, max_depth=None, max_features=1, n_estimators=200;, score=0.699 total time=   0.3s
[CV 5/5] END criterion=squared_error, max_depth=None, max_features=1, n_estimators=200;, score=0.720 total time=   0.3s
[CV 1/5] END criterion=squared_error, max_depth=None, max_features=1, n_estimators=250;, score=0.703 total time=   0.5s
[CV 2/5] END criterion=squared_error, max_depth=None, max_features=1, n_estimators=250;, score=0.705 total time=   0.4s
[CV 3/5] END criterion=squared_error, max_depth=None, max_features=1, n_estimators=250;, score=0.7

[CV 4/5] END criterion=squared_error, max_depth=5, max_features=sqrt, n_estimators=250;, score=0.767 total time=   0.3s
[CV 5/5] END criterion=squared_error, max_depth=5, max_features=sqrt, n_estimators=250;, score=0.721 total time=   0.3s
[CV 1/5] END criterion=squared_error, max_depth=5, max_features=sqrt, n_estimators=150;, score=0.749 total time=   0.1s
[CV 2/5] END criterion=squared_error, max_depth=5, max_features=sqrt, n_estimators=150;, score=0.759 total time=   0.1s
[CV 3/5] END criterion=squared_error, max_depth=5, max_features=sqrt, n_estimators=150;, score=0.788 total time=   0.2s
[CV 4/5] END criterion=squared_error, max_depth=5, max_features=sqrt, n_estimators=150;, score=0.760 total time=   0.1s
[CV 5/5] END criterion=squared_error, max_depth=5, max_features=sqrt, n_estimators=150;, score=0.754 total time=   0.1s
[CV 1/5] END criterion=squared_error, max_depth=5, max_features=log2, n_estimators=200;, score=0.745 total time=   0.2s
[CV 2/5] END criterion=squared_error, ma

[CV 3/5] END criterion=absolute_error, max_depth=None, max_features=1, n_estimators=200;, score=0.708 total time=   1.1s
[CV 4/5] END criterion=absolute_error, max_depth=None, max_features=1, n_estimators=200;, score=0.687 total time=   1.1s
[CV 5/5] END criterion=absolute_error, max_depth=None, max_features=1, n_estimators=200;, score=0.680 total time=   1.2s
[CV 1/5] END criterion=absolute_error, max_depth=None, max_features=1, n_estimators=250;, score=0.692 total time=   1.4s
[CV 2/5] END criterion=absolute_error, max_depth=None, max_features=1, n_estimators=250;, score=0.654 total time=   1.4s
[CV 3/5] END criterion=absolute_error, max_depth=None, max_features=1, n_estimators=250;, score=0.697 total time=   1.4s
[CV 4/5] END criterion=absolute_error, max_depth=None, max_features=1, n_estimators=250;, score=0.676 total time=   1.4s
[CV 5/5] END criterion=absolute_error, max_depth=None, max_features=1, n_estimators=250;, score=0.674 total time=   1.4s
[CV 1/5] END criterion=absolute_

[CV 1/5] END criterion=absolute_error, max_depth=5, max_features=sqrt, n_estimators=150;, score=0.754 total time=   0.8s
[CV 2/5] END criterion=absolute_error, max_depth=5, max_features=sqrt, n_estimators=150;, score=0.735 total time=   1.0s
[CV 3/5] END criterion=absolute_error, max_depth=5, max_features=sqrt, n_estimators=150;, score=0.771 total time=   1.2s
[CV 4/5] END criterion=absolute_error, max_depth=5, max_features=sqrt, n_estimators=150;, score=0.749 total time=   1.0s
[CV 5/5] END criterion=absolute_error, max_depth=5, max_features=sqrt, n_estimators=150;, score=0.685 total time=   1.0s
[CV 1/5] END criterion=absolute_error, max_depth=5, max_features=log2, n_estimators=200;, score=0.734 total time=   1.3s
[CV 2/5] END criterion=absolute_error, max_depth=5, max_features=log2, n_estimators=200;, score=0.710 total time=   1.3s
[CV 3/5] END criterion=absolute_error, max_depth=5, max_features=log2, n_estimators=200;, score=0.739 total time=   1.3s
[CV 4/5] END criterion=absolute_

In [27]:
print(grid.best_params_)
print(grid.best_score_)

{'criterion': 'absolute_error', 'max_depth': None, 'max_features': 'log2', 'n_estimators': 150}
0.8969408131635307


In [28]:
rfm_tuned = RandomForestRegressor(**grid.best_params_)
rfm_tuned.fit(X_train, y_train)

In [29]:
predictions = rfm_tuned.predict(X_test)
print(f"Training Data Score: {rfm_tuned.score(X_train, y_train)}")
print(f"Testing Data Score: {rfm_tuned.score(X_test, y_test)}")
print(f"Production Data Score: {rfm_tuned.score(X, y)}")

Training Data Score: 0.9816532650164224
Testing Data Score: 0.9223066290289834
Production Data Score: 0.9682439423054952


In [28]:
MSE_score = MSE(y_test,predictions)
print("Mean Squared Error: ",MSE_score.mean())
print("Root Mean Squared Error: ",np.sqrt(MSE_score.mean()))

Mean Squared Error:  6933647341.006582
Root Mean Squared Error:  83268.52551238422


The base model works better in this case.

# Export model

In [78]:
#  save the model
filename = '../Models/NJ_rfm_house_price2.sav'
joblib.dump(rfm, filename)

['../Models/NJ_rfm_house_price2.sav']

# Array for prediction

In [2]:
rfm = joblib.load('../Models/NJ_rfm_house_price.sav')

In [54]:
X.columns

Index(['year', 'num_of_bedrooms', 'est_pop', 'median_hh_income',
       'poverty_count', 'poverty_rate', 'tax_rate', 'county_label'],
      dtype='object')

In [75]:
county='UNION'
beds=3
year = 2026
df_filtered=df[(df['county_name']==county)][['county_name','year', 'num_of_bedrooms', 'est_pop', 'median_hh_income',
       'poverty_count', 'poverty_rate', 'tax_rate', 'county_label']]
county_label=ref_df[ref_df['county_name']==county]['county_label'].values[0]
best_array = df_filtered[(df_filtered['county_name']==county)&\
              (df_filtered['num_of_bedrooms']==beds)].tail(1)
best_array['year']=year
best_array['county_label']=county_label
# best_array['est_pop']=df_filtered['est_pop'].max()
# best_array['median_hh_income']=df_filtered['median_hh_income'].max()
# best_array['poverty_count']=df_filtered['poverty_count'].max()
# best_array['poverty_rate']=df_filtered['poverty_rate'].max()
# best_array['tax_rate']=df_filtered['tax_rate'].max()
best_array.reset_index(drop=True,inplace=True)
best_array

Unnamed: 0,county_name,year,num_of_bedrooms,est_pop,median_hh_income,poverty_count,poverty_rate,tax_rate,county_label
0,UNION,2026,3,572114,85926,52354,9.3,8.42,19


In [76]:
rfm.predict(best_array.drop('county_name',axis=1))[0]

410731.4104748989

In [73]:
county='ESSEX'
beds=3
year = 2026
df_filtered=df[(df['county_name']==county)][['county_name','year', 'num_of_bedrooms', 'est_pop', 'median_hh_income',
       'poverty_count', 'poverty_rate', 'tax_rate', 'county_label']]
county_label=ref_df[ref_df['county_name']==county]['county_label'].values[0]
best_array = df_filtered[(df_filtered['county_name']==county)&\
              (df_filtered['num_of_bedrooms']==beds)].head(1)
best_array['year']=year
best_array['county_label']=county_label
# best_array['est_pop']=df_filtered['est_pop'].max()
# best_array['median_hh_income']=df_filtered['median_hh_income'].max()
# best_array['poverty_count']=df_filtered['poverty_count'].max()
# best_array['poverty_rate']=df_filtered['poverty_rate'].max()
# best_array['tax_rate']=df_filtered['tax_rate'].max()
best_array.reset_index(drop=True,inplace=True)
best_array

Unnamed: 0,county_name,year,num_of_bedrooms,est_pop,median_hh_income,poverty_count,poverty_rate,tax_rate,county_label
0,ESSEX,2026,3,784017,52288,125503,16.4,3.13,6


In [74]:
rfm.predict(best_array.drop('county_name',axis=1))[0]

401481.3393907867

In [42]:
county='UNION'
beds=3
year = 2024
df_filtered=df[(df['county_name']==county)&(df['year']==df['year'].max())]
corr = df_filtered.corr()
corr.fillna(0, inplace=True)
county_label=ref_df[ref_df['county_name']==county]['county_label'].values[0]
best_array = df[(df['county_name']==county)&\
              (df['num_of_bedrooms']==beds)&(df['year']==df['year'].max())]
best_array['year']=year
best_array['county_label']=county_label
for i in best_array.drop(['county_name', 'year', 'house_value_index', 'county_label', 'st_abb','state_code','county_code'],axis=1).columns.tolist():
    best_array[i]=abs(best_array[i]+(year-2020)*corr.loc['year',i])
best_array.reset_index(drop=True,inplace=True)
best_array

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  best_array['year']=year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  best_array['county_label']=county_label
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  best_array[i]=abs(best_array[i]+(year-2020)*corr.loc['year',i])


Unnamed: 0,county_name,year,num_of_bedrooms,house_value_index,est_pop,median_hh_income,poverty_count,poverty_rate,st_abb,state_code,county_code,tax_rate,apr_30,points_30,apr_15,points_15,county_label
0,UNION,2024,3.0,435056.61,572114.0,85926.0,52354.0,9.3,NJ,34,39,8.42,2.96,0.68,2.27,0.64,19


In [43]:
rfm.predict(best_array.drop(['county_name','house_value_index', 'st_abb','state_code','county_code'],axis=1))[0]

406771.3838783805

In [34]:
df[(df[f'county_name']==county)&\
   (df['num_of_bedrooms']==beds)][['county_name', 'year',\
                                   'num_of_bedrooms','house_value_index']].tail(1)

Unnamed: 0,county_name,year,num_of_bedrooms,house_value_index
412,UNION,2020,3,377759.26


In [29]:
county='UNION'
beds=3
yr = 2026
df_filtered=df[(df['county_name']==county)]
corr = df_filtered.corr()
corr.fillna(0, inplace=True)
county_label=ref_df[ref_df['county_name']==county]['county_label'].values[0]
worst_array = df[(df['county_name']==county)&\
              (df['num_of_bedrooms']==beds)].head(1)
worst_array['year']=yr
worst_array['county_label']=county_label
for i in worst_array.drop(['county_name', 'year', 'house_value_index', 'county_label'],axis=1).columns.tolist():
    worst_array[i]=abs(worst_array[i]-(year-2020)*corr.loc['year',i])
worst_array.reset_index(drop=True,inplace=True)
worst_array

Unnamed: 0,county_name,year,min_tax_rate,avg_tax_rate,max_tax_rate,murder,rape,robbery,assault,burglary,larceny,auto_theft,avg_std_cnt_ele,avg_exp_ele,avg_score_ele,min_std_cnt_ele,min_exp_ele,min_score_ele,max_std_cnt_ele,max_exp_ele,max_score_ele,avg_std_cnt_mid,avg_exp_mid,avg_score_mid,min_std_cnt_mid,min_exp_mid,min_score_mid,max_std_cnt_mid,max_exp_mid,max_score_mid,avg_std_cnt_high,avg_exp_high,avg_score_high,min_std_cnt_high,min_exp_high,min_score_high,max_std_cnt_high,max_exp_high,max_score_high,est_pop,apr_30,points_30,apr_15,points_15,num_of_bedrooms,house_value_index,median_hh_income,poverty_count,poverty_rate,county_label
0,UNION,2026,9.26,14.83,21.88,13.42,94.39,840.66,779.22,1618.97,7566.0,1444.6,626.72,9658.08,43.24,139.47,6032.65,1.28,2223.58,15980.0,95.65,626.72,9658.08,43.24,139.47,6032.65,1.28,2223.58,15980.0,95.65,626.72,9658.08,43.24,139.47,6032.65,1.28,2223.58,15980.0,95.65,554693.09,8.24,4.42,6.81,3.89,3.0,322550.09,76824.6,52051.56,8.2,19


In [30]:
df[(df[f'county_name']==county)&\
   (df['num_of_bedrooms']==beds)][['county_name', 'year', 'num_of_bedrooms','house_value_index']].tail(1)

Unnamed: 0,county_name,year,num_of_bedrooms,house_value_index
412,UNION,2020,3,377759.26


In [36]:
value = rfm.predict(worst_array.drop(['county_name','house_value_index', 'county_label'],axis=1))[0] + 2*np.sqrt(MSE_score.mean())
value

354041.48059682467