In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
import pandas as pd 
import numpy as np
import os
import string
from math import sqrt
from config import u, p

In [2]:
# Evaluation Metrics
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score as rs
from sklearn.metrics import mean_absolute_error as mae

# to split train and test set
from sklearn.model_selection import train_test_split

# to perform hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV

#models
from sklearn.linear_model import Ridge  # Linear Regression + L2 regularization
from sklearn.linear_model import Lasso  # Linear Regression + L1 regularization
from sklearn.svm import SVR # Support Vector Regressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.tree import DecisionTreeRegressor

In [3]:
# Import SQLAlchemy `automap` and other dependencies here
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, inspect, func

In [4]:
# to save the final model on disk
from sklearn.externals import joblib

In [5]:
from psycopg2.extensions import register_adapter, AsIs
def adapt_numpy_int64(numpy_int64):
  return AsIs(numpy_int64)
register_adapter(np.int64, adapt_numpy_int64)

## Connect to DB: Create sample data that joins Acocunt Info, Appraisal Data, & Res_Adl

In [10]:
database_path = f'postgres://{u}:{p}@database1.cpwzlmglu2fg.us-east-2.rds.amazonaws.com/proptax'

In [11]:
# Create Engine
engine = create_engine(database_path)
conn = engine.connect()

In [12]:
# The ORM’s “handle” to the database is the Session.
session = Session(engine)

### Sample 2019

In [13]:
query_one = "select ai.account_num, ai.appraisal_yr, aay.tot_val, aay.prev_mkt_val, CASE WHEN aay.tot_val < aay.prev_mkt_val THEN 1 ELSE 0 END as Decrease, \
       street_num, street_half_num, full_street_name, property_city, property_zipcode, nbhd_cd, desirability_rating, bldg_class_cd, tot_living_area_sf, foundation_typ_desc, heating_typ_desc, ac_typ_desc, ext_wall_desc, roof_typ_desc, num_fireplaces, num_kitchens, num_full_baths, num_half_baths, num_wet_bars, num_bedrooms, sprinkler_sys_ind, pool_ind, \
       l.area_size \
        from account_info_2019 as ai \
        INNER JOIN account_apprl_year_2019 as aay on ai.account_num = aay.account_num \
        INNER JOIN res_detail_2019 as rd on aay.account_num = rd.account_num \
        INNER JOIN land as l on ai.account_num = l.account_num\
        where ai.division_cd = 'RES'"
#and nbhd_cd = '2RSS04'"

res_2019_data = pd.read_sql(query_one, conn).set_index('account_num')

## Explore Sample Data 

In [14]:
# 2019
res_2019_data.head(10)

Unnamed: 0_level_0,appraisal_yr,tot_val,prev_mkt_val,decrease,street_num,street_half_num,full_street_name,property_city,property_zipcode,nbhd_cd,...,roof_typ_desc,num_fireplaces,num_kitchens,num_full_baths,num_half_baths,num_wet_bars,num_bedrooms,sprinkler_sys_ind,pool_ind,area_size
account_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
008222000M0130000,2019,181500.0,181500.0,0,15209,,KING OF SPAIN CT,DALLAS,752486431,2DS112,...,HIP,1,1,2,1,0,2,Y,N,1234.0
00000574711000000,2019,186540.0,169930.0,0,11832,,RUPLEY LN,DALLAS,752182068,3DSG01,...,GABLE,0,1,2,0,0,3,N,N,7255.0
00000120658000000,2019,508910.0,474890.0,0,5111,,WORTH ST,DALLAS,752145351,1DSM19,...,GABLE,1,1,2,1,0,4,N,N,6153.0
00000184093000000,2019,330380.0,303070.0,0,6026,,VELASCO AVE,DALLAS,752066332,1DSJ12,...,GABLE,0,2,2,0,0,2,N,N,7785.0
00000450313000000,2019,49310.0,21590.0,0,2243,,EXETER DR,DALLAS,752166403,4DSA17,...,GABLE,0,1,1,0,0,2,N,N,6637.0
00000705280000000,2019,142060.0,142060.0,0,2120,,TOLOSA DR,DALLAS,752282020,3DSG24,...,HIP,0,1,1,0,0,3,N,N,7697.0
00000570625000000,2019,109720.0,40420.0,0,8700,,DUNLAP DR,DALLAS,752174245,4DSD11,...,GABLE,0,1,1,0,0,3,N,N,9216.0
00000574627000000,2019,141380.0,125340.0,0,11609,,FLAMINGO LN,DALLAS,752182006,3DSG01,...,GABLE,0,1,1,0,0,3,N,N,7513.0
00000436900000000,2019,120640.0,103980.0,0,6139,,PARKDALE DR,DALLAS,752273614,4DSG03,...,GABLE,0,1,1,0,0,3,N,N,9754.0
00000568027000000,2019,98610.0,68860.0,0,8450,,NISQUALLY ST,DALLAS,752174380,4DSE06,...,HIP,0,1,1,0,0,2,N,N,7920.0


In [15]:
res_2019_data.shape

(647881, 27)

In [16]:
res_2019_data.columns

Index(['appraisal_yr', 'tot_val', 'prev_mkt_val', 'decrease', 'street_num',
       'street_half_num', 'full_street_name', 'property_city',
       'property_zipcode', 'nbhd_cd', 'desirability_rating', 'bldg_class_cd',
       'tot_living_area_sf', 'foundation_typ_desc', 'heating_typ_desc',
       'ac_typ_desc', 'ext_wall_desc', 'roof_typ_desc', 'num_fireplaces',
       'num_kitchens', 'num_full_baths', 'num_half_baths', 'num_wet_bars',
       'num_bedrooms', 'sprinkler_sys_ind', 'pool_ind', 'area_size'],
      dtype='object')

In [None]:
res_2019_data.dtypes

In [None]:
# Plot histogram grid
res_2019_data.hist(figsize=(24,24), xrot=-45) ## Display the labels rotated by 45 degress

# Clear the text "residue"
plt.show()

In [None]:
# PLot Desirablilty
plt.figure(figsize=(8,8))
sns.countplot(y='desirability_rating', data=res_2019_data)

In [17]:
# Drop Null Values 
drop_res_2019_data = res_2019_data.dropna(how="all")

In [None]:
drop_res_2019_data.head()

In [None]:
#drop_res_2019_data.loc['00000788983000000']

In [None]:
plt.figure(figsize=(16,16))
sns.heatmap(drop_res_2019_data.corr())

In [18]:
# Create a DF using only existing interger columns 
int_df = drop_res_2019_data[['tot_val', 'decrease', 'tot_living_area_sf', 'num_kitchens', 'num_full_baths', 'num_half_baths', 'num_bedrooms', 'area_size']]
int_df.head()

Unnamed: 0_level_0,tot_val,decrease,tot_living_area_sf,num_kitchens,num_full_baths,num_half_baths,num_bedrooms,area_size
account_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
008222000M0130000,181500.0,0,1815,1,2,1,2,1234.0
00000574711000000,186540.0,0,1365,1,2,0,3,7255.0
00000120658000000,508910.0,0,2684,1,2,1,4,6153.0
00000184093000000,330380.0,0,1408,2,2,0,2,7785.0
00000450313000000,49310.0,0,672,1,1,0,2,6637.0


In [None]:
int_df.columns

In [19]:
# Create a DF using only existing object columns 
object_df = drop_res_2019_data[['pool_ind', 'sprinkler_sys_ind', 'foundation_typ_desc', 'heating_typ_desc', 'ac_typ_desc', 'ext_wall_desc', 'roof_typ_desc', 'desirability_rating', 'bldg_class_cd','nbhd_cd']]
object_df.head()

Unnamed: 0_level_0,pool_ind,sprinkler_sys_ind,foundation_typ_desc,heating_typ_desc,ac_typ_desc,ext_wall_desc,roof_typ_desc,desirability_rating,bldg_class_cd,nbhd_cd
account_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
008222000M0130000,N,Y,SLAB,CENTRAL FULL,CENTRAL FULL,STUCCO,HIP,EXCELLENT,TOWNHOME,2DS112
00000574711000000,N,N,PIER AND BEAM,CENTRAL FULL,CENTRAL FULL,VINYL,GABLE,GOOD,04,3DSG01
00000120658000000,N,N,PIER AND BEAM,CENTRAL FULL,CENTRAL FULL,FRAME,GABLE,GOOD,06,1DSM19
00000184093000000,N,N,BLOCK,GAS HEATERS,WINDOW,FRAME,GABLE,FAIR,03,1DSJ12
00000450313000000,N,N,BLOCK,CENTRAL FULL,CENTRAL FULL,FRAME,GABLE,FAIR,04,4DSA17


In [None]:
object_df.columns

### Get Dummies

In [20]:
# Dummify object dataframe 
dummy_df = pd.get_dummies(object_df)
dummy_df.head()

Unnamed: 0_level_0,pool_ind_N,pool_ind_Y,sprinkler_sys_ind_N,sprinkler_sys_ind_Y,foundation_typ_desc_BLOCK,foundation_typ_desc_BRICK,foundation_typ_desc_FOUNDATION SUPPORT,foundation_typ_desc_GIRDER ON THE GROUND,foundation_typ_desc_GRADE AND INTERIOR BEAM,foundation_typ_desc_IRON PIPE,...,nbhd_cd_5OSU13,nbhd_cd_5OSU14,nbhd_cd_5OSU15,nbhd_cd_5OSU16,nbhd_cd_5OSU17,nbhd_cd_5OSU18,nbhd_cd_5OSV01,nbhd_cd_5OSW01,nbhd_cd_5OSW02,nbhd_cd_5OSW03
account_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
008222000M0130000,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
00000574711000000,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
00000120658000000,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
00000184093000000,1,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
00000450313000000,1,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
dummy_df.columns

In [None]:
#dummy_df.dtypes #takes forever on the full dataset

### Create the Clean DF by merging int_df and dummy_df

In [21]:
clean_df = pd.merge(int_df, dummy_df, right_index=True, left_index=True, how="inner")
clean_df = clean_df.drop('pool_ind_N',axis=1)
clean_df = clean_df.drop('sprinkler_sys_ind_N',axis=1)
clean_df.head()

Unnamed: 0_level_0,tot_val,decrease,tot_living_area_sf,num_kitchens,num_full_baths,num_half_baths,num_bedrooms,area_size,pool_ind_Y,sprinkler_sys_ind_Y,...,nbhd_cd_5OSU13,nbhd_cd_5OSU14,nbhd_cd_5OSU15,nbhd_cd_5OSU16,nbhd_cd_5OSU17,nbhd_cd_5OSU18,nbhd_cd_5OSV01,nbhd_cd_5OSW01,nbhd_cd_5OSW02,nbhd_cd_5OSW03
account_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
107512000000,412330.0,0,1656,1,2,0,2,3540.0,0,1,...,0,0,0,0,0,0,0,0,0,0
107515000000,499200.0,0,3446,1,3,1,3,3960.0,1,0,...,0,0,0,0,0,0,0,0,0,0
107680100000,402020.0,0,2673,1,2,1,3,3639.0,0,1,...,0,0,0,0,0,0,0,0,0,0
107680120000,332220.0,0,1577,1,2,0,2,3397.0,0,1,...,0,0,0,0,0,0,0,0,0,0
107680140000,420920.0,0,2345,1,2,0,2,3397.0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [None]:
clean_df.describe()

In [22]:
clean_df = clean_df.dropna(how="any")

In [23]:
clean_df.shape

(651531, 5213)

### Create Dataframes to use with models 

In [24]:
df_rfr = clean_df
df_rfr.head()

Unnamed: 0_level_0,tot_val,decrease,tot_living_area_sf,num_kitchens,num_full_baths,num_half_baths,num_bedrooms,area_size,pool_ind_Y,sprinkler_sys_ind_Y,...,nbhd_cd_5OSU13,nbhd_cd_5OSU14,nbhd_cd_5OSU15,nbhd_cd_5OSU16,nbhd_cd_5OSU17,nbhd_cd_5OSU18,nbhd_cd_5OSV01,nbhd_cd_5OSW01,nbhd_cd_5OSW02,nbhd_cd_5OSW03
account_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
107512000000,412330.0,0,1656,1,2,0,2,3540.0,0,1,...,0,0,0,0,0,0,0,0,0,0
107515000000,499200.0,0,3446,1,3,1,3,3960.0,1,0,...,0,0,0,0,0,0,0,0,0,0
107680100000,402020.0,0,2673,1,2,1,3,3639.0,0,1,...,0,0,0,0,0,0,0,0,0,0
107680120000,332220.0,0,1577,1,2,0,2,3397.0,0,1,...,0,0,0,0,0,0,0,0,0,0
107680140000,420920.0,0,2345,1,2,0,2,3397.0,0,1,...,0,0,0,0,0,0,0,0,0,0


<h1><center>Machine Learning Models</center></h1>

## Random Forest Regressor

In [None]:
df_rfr.head()

In [25]:
# Create separate object for target variable
y = df_rfr.tot_val
# Create separate object for input features
X = df_rfr.drop('tot_val', axis=1)

In [26]:
# Split X and y into train and test sets: 80-20
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# train_mean = X_train.mean()
# train_std = X_train.std()

In [None]:
## Standardize the train data set
#X_train = (X_train - train_mean) / train_std

In [None]:
#X_train.describe()

In [None]:
## Note: We use train_mean and train_std_dev to standardize test data set
#X_test = (X_test - train_mean) / train_std

In [None]:
## Check for mean and std dev. - not exactly 0 and 1
#X_test.describe()

In [None]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [28]:
## Reference for random search on random forest
## https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74
tuned_params = {'n_estimators': [100, 200, 300, 400, 500], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]}
model = RandomizedSearchCV(RandomForestRegressor(), tuned_params, n_iter=20, scoring = 'neg_mean_absolute_error', cv=5, n_jobs=6)
model.fit(X_train, y_train)
## This takes around 15 minutes



MemoryError: 

In [None]:
model.best_estimator_

In [None]:
## Predict Train results
y_train_pred = model.predict(X_train)

In [None]:
## Predict Test results
y_test_pred = model.predict(X_test)

In [None]:
#y_test_pred

In [None]:
print("Train Results for Random Forest Regression:")
print("*******************************")
print("Root mean squared error: ", sqrt(mse(y_train.values, y_train_pred)))
print("R-squared: ", rs(y_train.values, y_train_pred))
print("Mean Absolute Error: ", mae(y_train.values, y_train_pred))

In [None]:
print("Test Results for Random Forest Regression:")
print("*******************************")
print("Root mean squared error: ", sqrt(mse(y_test, y_test_pred)))
print("R-squared: ", rs(y_test, y_test_pred))
print("Mean Absolute Error: ", mae(y_test, y_test_pred))

In [None]:
## Building the model again with the best hyperparameters
model = RandomForestRegressor(n_estimators=200, min_samples_split=10, min_samples_leaf=2)
model.fit(X_train, y_train)

In [None]:
indices = np.argsort(-model.feature_importances_)
print("The features in order of importance are:")
print(50*'-')
for feature in X.columns[indices]:
    print(feature)

In [None]:
model.fit(X_train, y_train)
with open('rfr_prop_tax.model', 'wb') as f:
       joblib.dump(model, 'rfr_prop_tax.model')

## Run Saved Model

In [None]:
loaded_model = joblib.load('rfr_prop_tax.model')
result = loaded_model.score(X, y)
print(result)

In [None]:
loaded_model.fit(X, y)

In [None]:
tot_val_pred = model.predict(X)

In [None]:
#tot_val_pred

In [None]:
df_rfr['tot_val_pred'] = tot_val_pred.round(decimals=0, out=None)

In [None]:
df_rfr[['tot_val', 'tot_val_pred']].head(100)

In [None]:
#empty existing table
conn.execute('truncate table predicted_values')

In [None]:
#insert predicted values in database
df_rfr[['tot_val', 'tot_val_pred']].to_sql('predicted_values', conn)