In [3]:
# Dependencies
import pandas as pd
pd.set_option('display.max_columns', 68)
pd.options.display.float_format = '{:,.2f}'.format
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib

In [219]:
# SQLite dependencies
import sqlite3
from sqlalchemy import create_engine, text
from sqlalchemy import Column, Integer, String, Float
from pandas_profiling import ProfileReport
#  SQLite DB creation and establishing connection
database_path = "NJ_County_DB.sqlite"
engine = create_engine(f"sqlite:///{database_path}", echo=True)
sqlite_connection = engine.connect()

In [None]:
# (SELECT county_name , year, AVG(house_value_index) AS house_value_index
# FROM nj_zillow_house_value_index GROUP BY 1,2)

In [402]:
sql_query = """
SELECT * FROM nj_zillow_house_value_index
 AS T1
INNER JOIN nj_population AS T2 ON T1.county_name = T2.county_name AND T1.year=T2.year
INNER JOIN nj_poverty_median_income AS T3 ON T1.county_name = T3.county_name AND T1.year=T3.year
INNER JOIN 
(SELECT county_name , year, AVG(tax_rate) AS tax_rate 
FROM nj_property_tax GROUP BY 1,2)AS T4 ON T1.county_name = T4.county_name AND T1.year=T4.year
INNER JOIN nj_mortgage_rates AS T5 ON T1.year=T5.year
;
"""
all_df = pd.read_sql(sql_query,sqlite_connection)
all_df.info()

2023-03-24 18:18:50,448 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("
SELECT * FROM nj_zillow_house_value_index
 AS T1
INNER JOIN nj_population AS T2 ON T1.county_name = T2.county_name AND T1.year=T2.year
INNER JOIN nj_poverty_median_income AS T3 ON T1.county_name = T3.county_name AND T1.year=T3.year
INNER JOIN 
(SELECT county_name , year, AVG(tax_rate) AS tax_rate 
FROM nj_property_tax GROUP BY 1,2)AS T4 ON T1.county_name = T4.county_name AND T1.year=T4.year
INNER JOIN nj_mortgage_rates AS T5 ON T1.year=T5.year
;
")
2023-03-24 18:18:50,450 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-03-24 18:18:50,455 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("
SELECT * FROM nj_zillow_house_value_index
 AS T1
INNER JOIN nj_population AS T2 ON T1.county_name = T2.county_name AND T1.year=T2.year
INNER JOIN nj_poverty_median_income AS T3 ON T1.county_name = T3.county_name AND T1.year=T3.year
INNER JOIN 
(SELECT county_name , year, AVG(tax_rate) AS tax_rate 
FROM nj_property_tax GR

In [403]:
all_df.describe()

Unnamed: 0,year,num_of_bedrooms,house_value_index,est_pop,year.1,median_hh_income,poverty_count,poverty_rate,year.2,year.3,tax_rate,year.4,apr_30,points_30,apr_15,points_15
count,1260.0,1260.0,1260.0,1260.0,1260.0,1260.0,1260.0,1260.0,1260.0,1260.0,1260.0,1260.0,1260.0,1260.0,1260.0,1260.0
mean,2015.5,3.0,301526.97,423663.9,2015.5,76414.14,42851.11,10.05,2015.5,2015.5,3.38,2015.5,3.92,0.63,3.22,0.6
std,3.45,1.41,174631.35,253661.54,3.45,18476.21,32803.23,4.06,3.45,3.45,1.65,3.45,0.5,0.09,0.51,0.09
min,2010.0,1.0,47502.88,62341.0,2010.0,45438.0,4793.0,3.9,2010.0,2010.0,0.97,2010.0,2.96,0.48,2.27,0.45
25%,2012.75,2.0,171183.5,150928.75,2012.75,61363.25,16560.5,6.67,2012.75,2012.75,2.47,2012.75,3.66,0.53,2.93,0.49
50%,2015.5,3.0,263753.4,448449.0,2015.5,74839.0,36293.5,9.45,2015.5,2015.5,2.97,2015.5,3.96,0.65,3.19,0.61
75%,2018.25,4.0,388531.82,603111.25,2018.25,87602.25,63450.25,13.1,2018.25,2018.25,3.69,2018.25,4.24,0.72,3.46,0.69
max,2021.0,5.0,1312995.76,953819.0,2021.0,123708.0,136161.0,20.1,2021.0,2021.0,9.66,2021.0,4.69,0.73,4.1,0.71


In [404]:
all_df.head()

Unnamed: 0,county_name,year,num_of_bedrooms,house_value_index,county_name.1,est_pop,year.1,county_name.2,median_hh_income,poverty_count,poverty_rate,st_abb,year.2,state_code,county_code,county_name.3,year.3,tax_rate,year.4,apr_30,points_30,apr_15,points_15
0,ATLANTIC,2010,1,120414.14,ATLANTIC,274654,2010,ATLANTIC,51457,36693,13.6,NJ,2010,34,1,ATLANTIC,2010,2.67,2010,4.69,0.72,4.1,0.67
1,ATLANTIC,2011,1,106680.39,ATLANTIC,274635,2011,ATLANTIC,49983,35108,13.1,NJ,2011,34,1,ATLANTIC,2011,2.65,2011,4.45,0.72,3.68,0.71
2,ATLANTIC,2012,1,100139.16,ATLANTIC,274657,2012,ATLANTIC,50881,38245,14.2,NJ,2012,34,1,ATLANTIC,2012,2.73,2012,3.66,0.72,2.93,0.68
3,ATLANTIC,2013,1,94991.76,ATLANTIC,274360,2013,ATLANTIC,51668,46281,17.1,NJ,2013,34,1,ATLANTIC,2013,2.87,2013,3.98,0.73,3.1,0.7
4,ATLANTIC,2014,1,92839.52,ATLANTIC,272634,2014,ATLANTIC,54208,40761,15.1,NJ,2014,34,1,ATLANTIC,2014,2.84,2014,4.17,0.59,3.29,0.57


In [405]:
all_df=all_df.loc[:,~all_df.columns.duplicated()].copy()
all_df

Unnamed: 0,county_name,year,num_of_bedrooms,house_value_index,est_pop,median_hh_income,poverty_count,poverty_rate,st_abb,state_code,county_code,tax_rate,apr_30,points_30,apr_15,points_15
0,ATLANTIC,2010,1,120414.14,274654,51457,36693,13.60,NJ,034,001,2.67,4.69,0.72,4.10,0.67
1,ATLANTIC,2011,1,106680.39,274635,49983,35108,13.10,NJ,034,001,2.65,4.45,0.72,3.68,0.71
2,ATLANTIC,2012,1,100139.16,274657,50881,38245,14.20,NJ,034,001,2.73,3.66,0.72,2.93,0.68
3,ATLANTIC,2013,1,94991.76,274360,51668,46281,17.10,NJ,034,001,2.87,3.98,0.73,3.10,0.70
4,ATLANTIC,2014,1,92839.52,272634,54208,40761,15.10,NJ,034,001,2.84,4.17,0.59,3.29,0.57
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1255,WARREN,2017,5,337688.95,105761,79633,7770,7.40,NJ,034,041,3.28,3.99,0.50,3.27,0.49
1256,WARREN,2018,5,348528.58,105709,77571,7006,6.70,NJ,034,041,3.32,4.54,0.48,4.00,0.45
1257,WARREN,2019,5,352652.50,105455,83998,7313,7.10,NJ,034,041,3.36,3.94,0.52,3.39,0.47
1258,WARREN,2020,5,365908.92,105624,80412,7539,7.30,NJ,034,041,3.40,3.11,0.73,2.60,0.69


In [417]:
# Set features. This will also be used as x values.
X = all_df.drop(['county_name', 'st_abb','state_code','county_code', 'est_pop', 'points_30','points_15',\
                 'tax_rate','poverty_count','apr_30','apr_15'], axis=1)
y = all_df["county_name"]
print(X.shape, y.shape)

(1260, 5) (1260,)


In [418]:
# Split data into training and testing groups and scale data 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [419]:
# Fit the data into model
rfm = RandomForestClassifier(n_estimators=200)
rfm.fit(X_train, y_train)

In [420]:
sorted(zip(rfm.feature_importances_, X.columns), reverse=True)

[(0.3609237487342005, 'poverty_rate'),
 (0.3103848368846969, 'median_hh_income'),
 (0.15761129628656217, 'house_value_index'),
 (0.12514809140252514, 'year'),
 (0.04593202669201531, 'num_of_bedrooms')]

In [422]:
predictions = rfm.predict(X_test)
print(f"Training Data Score: {rfm.score(X_train, y_train)}")
print(f"Testing Data Score: {rfm.score(X_test, y_test)}")

Training Data Score: 1.0
Testing Data Score: 0.946031746031746


In [464]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

    ATLANTIC       0.85      0.85      0.85        13
      BERGEN       1.00      1.00      1.00        15
  BURLINGTON       1.00      0.94      0.97        16
      CAMDEN       1.00      1.00      1.00        14
    CAPE MAY       1.00      0.94      0.97        16
  CUMBERLAND       0.94      1.00      0.97        16
       ESSEX       0.89      0.84      0.86        19
  GLOUCESTER       0.90      1.00      0.95        18
      HUDSON       0.80      1.00      0.89        12
   HUNTERDON       1.00      1.00      1.00        14
      MERCER       1.00      1.00      1.00        12
   MIDDLESEX       1.00      1.00      1.00        17
    MONMOUTH       1.00      1.00      1.00        13
      MORRIS       0.82      0.82      0.82        17
       OCEAN       0.94      1.00      0.97        17
     PASSAIC       1.00      0.77      0.87        13
       SALEM       1.00      1.00      1.00        16
    SOMERSET       0.85    

# Prediction array

In [472]:
beds=3
income=80000
budget=400000
all_df_latest=all_df[all_df['year']==all_df['year'].max()]
# if (income==rfm_array['median_hh_income'])&(budget==rfm_array['house_value_index']) :
rfm_array=pd.DataFrame({'year':all_df_latest['year'].max(),
                        'num_of_bedrooms':beds,
                        'house_value_index':budget,
                        'median_hh_income':income,
                        'poverty_rate':all_df_latest['poverty_rate'].max()},[0])
rfm.predict(rfm_array)

array(['PASSAIC'], dtype=object)

In [463]:
#  save the model
filename = '../Models/NJ_rfm.sav'
joblib.dump(rfm, filename)

['../Models/NJ_rfm.sav']

# Hyper Parameter tuning

In [466]:
# Create the GridSearchCV model
param_grid = {
    'n_estimators': [200, 600, 1200, 100],
    'max_features': ['sqrt', 'log2'],
    'criterion': ['gini', 'entropy'],
    'max_depth':[None,5,10,15,20]
}
grid = GridSearchCV(rfm, param_grid, cv=3, verbose=3)

In [467]:
# Train the model with GridSearch
grid.fit(X_train, y_train)

Fitting 3 folds for each of 80 candidates, totalling 240 fits
[CV 1/3] END criterion=gini, max_depth=None, max_features=sqrt, n_estimators=200;, score=0.851 total time=   0.6s
[CV 2/3] END criterion=gini, max_depth=None, max_features=sqrt, n_estimators=200;, score=0.854 total time=   0.8s
[CV 3/3] END criterion=gini, max_depth=None, max_features=sqrt, n_estimators=200;, score=0.832 total time=   0.5s
[CV 1/3] END criterion=gini, max_depth=None, max_features=sqrt, n_estimators=600;, score=0.851 total time=   1.4s
[CV 2/3] END criterion=gini, max_depth=None, max_features=sqrt, n_estimators=600;, score=0.851 total time=   1.8s
[CV 3/3] END criterion=gini, max_depth=None, max_features=sqrt, n_estimators=600;, score=0.829 total time=   1.6s
[CV 1/3] END criterion=gini, max_depth=None, max_features=sqrt, n_estimators=1200;, score=0.848 total time=   2.9s
[CV 2/3] END criterion=gini, max_depth=None, max_features=sqrt, n_estimators=1200;, score=0.854 total time=   2.9s
[CV 3/3] END criterion=g

[CV 2/3] END criterion=gini, max_depth=15, max_features=sqrt, n_estimators=200;, score=0.835 total time=   0.4s
[CV 3/3] END criterion=gini, max_depth=15, max_features=sqrt, n_estimators=200;, score=0.822 total time=   0.4s
[CV 1/3] END criterion=gini, max_depth=15, max_features=sqrt, n_estimators=600;, score=0.851 total time=   1.3s
[CV 2/3] END criterion=gini, max_depth=15, max_features=sqrt, n_estimators=600;, score=0.860 total time=   1.3s
[CV 3/3] END criterion=gini, max_depth=15, max_features=sqrt, n_estimators=600;, score=0.822 total time=   1.3s
[CV 1/3] END criterion=gini, max_depth=15, max_features=sqrt, n_estimators=1200;, score=0.857 total time=   2.7s
[CV 2/3] END criterion=gini, max_depth=15, max_features=sqrt, n_estimators=1200;, score=0.854 total time=   2.6s
[CV 3/3] END criterion=gini, max_depth=15, max_features=sqrt, n_estimators=1200;, score=0.822 total time=   2.7s
[CV 1/3] END criterion=gini, max_depth=15, max_features=sqrt, n_estimators=100;, score=0.838 total ti

[CV 2/3] END criterion=entropy, max_depth=5, max_features=sqrt, n_estimators=200;, score=0.632 total time=   0.4s
[CV 3/3] END criterion=entropy, max_depth=5, max_features=sqrt, n_estimators=200;, score=0.663 total time=   0.4s
[CV 1/3] END criterion=entropy, max_depth=5, max_features=sqrt, n_estimators=600;, score=0.705 total time=   1.3s
[CV 2/3] END criterion=entropy, max_depth=5, max_features=sqrt, n_estimators=600;, score=0.660 total time=   1.4s
[CV 3/3] END criterion=entropy, max_depth=5, max_features=sqrt, n_estimators=600;, score=0.667 total time=   1.3s
[CV 1/3] END criterion=entropy, max_depth=5, max_features=sqrt, n_estimators=1200;, score=0.686 total time=   2.7s
[CV 2/3] END criterion=entropy, max_depth=5, max_features=sqrt, n_estimators=1200;, score=0.654 total time=   3.0s
[CV 3/3] END criterion=entropy, max_depth=5, max_features=sqrt, n_estimators=1200;, score=0.670 total time=   3.0s
[CV 1/3] END criterion=entropy, max_depth=5, max_features=sqrt, n_estimators=100;, sc

[CV 2/3] END criterion=entropy, max_depth=20, max_features=sqrt, n_estimators=200;, score=0.854 total time=   0.5s
[CV 3/3] END criterion=entropy, max_depth=20, max_features=sqrt, n_estimators=200;, score=0.841 total time=   0.5s
[CV 1/3] END criterion=entropy, max_depth=20, max_features=sqrt, n_estimators=600;, score=0.851 total time=   2.2s
[CV 2/3] END criterion=entropy, max_depth=20, max_features=sqrt, n_estimators=600;, score=0.851 total time=   2.6s
[CV 3/3] END criterion=entropy, max_depth=20, max_features=sqrt, n_estimators=600;, score=0.848 total time=   1.8s
[CV 1/3] END criterion=entropy, max_depth=20, max_features=sqrt, n_estimators=1200;, score=0.857 total time=   4.0s
[CV 2/3] END criterion=entropy, max_depth=20, max_features=sqrt, n_estimators=1200;, score=0.867 total time=   4.2s
[CV 3/3] END criterion=entropy, max_depth=20, max_features=sqrt, n_estimators=1200;, score=0.844 total time=   4.4s
[CV 1/3] END criterion=entropy, max_depth=20, max_features=sqrt, n_estimators

In [468]:
print(grid.best_params_)
print(grid.best_score_)

{'criterion': 'entropy', 'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 100}
0.8634920634920635


In [469]:
rfm_tuned = RandomForestClassifier(**grid.best_params_)
rfm_tuned.fit(X_train, y_train)

In [470]:
predictions = rfm_tuned.predict(X_test)
print(f"Training Data Score: {rfm_tuned.score(X_train, y_train)}")
print(f"Testing Data Score: {rfm_tuned.score(X_test, y_test)}")

Training Data Score: 1.0
Testing Data Score: 0.9333333333333333


Tuned model has less accuracy so we went with the base model.