# Imports

In [3]:
# import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA


In [4]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [8]:
os.chdir('/content/drive/MyDrive/project_4/Property_Value_NN_Estimator')

# Read and Clean Data

In [14]:
# import data
redfin_df = pd.read_csv("cleaned_data.csv")
redfin_df.head()

Unnamed: 0,full_address,beds,baths,year_built,sq_ft_interior,sq_ft_lot,price,lat,long,zip_code
0,2326 LUAU ST MESQUITE TX 75150,3,1.5,1966,1356,8303,241183,32.798774,-96.649097,75150
1,1737 HIGHLAND ST MESQUITE TX 75149,3,1.0,1955,1454,7501,218014,32.787427,-96.609647,75149
2,2900 SIDNEY DR MESQUITE TX 75150,3,2.0,1972,1392,7196,241089,32.817247,-96.644125,75150
3,1507 RICHARD ST MESQUITE TX 75149,3,1.0,1957,1313,7501,205940,32.787413,-96.616575,75149
4,806 BRANDON DR SEAGOVILLE TX 75159,5,2.0,2004,1504,8407,275611,32.661589,-96.547004,75159


In [15]:
redfin_copy_df = redfin_df.copy()

In [16]:
# drop columns
redfin_copy_df = redfin_copy_df.drop(columns=['full_address', 'lat', 'long'])
redfin_copy_df.head()

Unnamed: 0,beds,baths,year_built,sq_ft_interior,sq_ft_lot,price,zip_code
0,3,1.5,1966,1356,8303,241183,75150
1,3,1.0,1955,1454,7501,218014,75149
2,3,2.0,1972,1392,7196,241089,75150
3,3,1.0,1957,1313,7501,205940,75149
4,5,2.0,2004,1504,8407,275611,75159


In [17]:
redfin_copy_df.nunique()

beds                 12
baths                22
year_built          124
sq_ft_interior     3944
sq_ft_lot          5680
price             31895
zip_code             82
dtype: int64

In [18]:
redfin_copy_df.dtypes

beds                int64
baths             float64
year_built          int64
sq_ft_interior      int64
sq_ft_lot           int64
price               int64
zip_code            int64
dtype: object

In [19]:
# zip needs to be treated as categorical
redfin_copy_df['zip_code'] = redfin_copy_df['zip_code'].astype(str)
redfin_copy_df.dtypes

beds                int64
baths             float64
year_built          int64
sq_ft_interior      int64
sq_ft_lot           int64
price               int64
zip_code           object
dtype: object

In [20]:
# convert categorical data
redfin_copy_df = pd.get_dummies(redfin_copy_df)
redfin_copy_df.head()

Unnamed: 0,beds,baths,year_built,sq_ft_interior,sq_ft_lot,price,zip_code_75001,zip_code_75006,zip_code_75007,zip_code_75019,...,zip_code_75240,zip_code_75241,zip_code_75243,zip_code_75244,zip_code_75246,zip_code_75248,zip_code_75249,zip_code_75252,zip_code_75253,zip_code_75254
0,3,1.5,1966,1356,8303,241183,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3,1.0,1955,1454,7501,218014,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,2.0,1972,1392,7196,241089,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,1.0,1957,1313,7501,205940,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,2.0,2004,1504,8407,275611,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Split, Scale, Compile for NN model

In [21]:
# split preprocessed data into features and targets
y = redfin_copy_df['price'].values
X = redfin_copy_df.drop(columns='price').values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [22]:
# create the scaler
scaler = StandardScaler()

# fit the scaler
X_scaler = scaler.fit(X_train)

# scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Optional PCA method

In [23]:
# # Instantiate the PCA instance with 3 PCA variables
# pca = PCA(n_components=3)

# # Fit the PCA model on the transformed credit card DataFrame
# pca_X = pca.fit_transform(df_encoded)

# # Review the first 5 rows of list data
# pca_X[:5]

# Set up HyperParameter tuning process

In [25]:
# Install keras-tuner for hyperparamter tuning later on
# Uncomment when using in Google Colab
# !pip install keras-tuner

Collecting keras-tuner
  Downloading keras_tuner-1.4.6-py3-none-any.whl (128 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m128.9/128.9 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.6 kt-legacy-1.0.5


In [52]:
# Create a method that creates a new Sequential model with hyperparameter options
def create_model(hp):
    nn_model = tf.keras.models.Sequential()

    # Allow kerastuner to decide which activation function to use in hidden layers
    activation = hp.Choice('activation',['relu','tanh','linear','softplus'])
    # activation2 = hp.Choice('activation',['relu','tanh','linear','softplus'])
    activation2 = ['relu','tanh','linear','softplus']


    # Allow kerastuner to decide number of neurons in first layer
    nn_model.add(tf.keras.layers.Dense(units=hp.Int('first_units',
        min_value=1,
        max_value=10,
        step=2), activation=activation, input_dim=87))

    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
    for i in range(hp.Int('num_layers', 1, 20)):
        nn_model.add(tf.keras.layers.Dense(units=hp.Int('units_' + str(i),
            min_value=1,
            max_value=20,
            step=2),
            activation=activation))

    nn_model.add(tf.keras.layers.Dense(units=1, activation=activation2[3]))

    # Compile the model
    nn_model.compile(loss="mean_absolute_percentage_error", optimizer='adam', metrics=[tf.keras.metrics.RootMeanSquaredError(), 'mae'])

    return nn_model

In [53]:
# Import the kerastuner library
import keras_tuner as kt

# Create tuner and establish parameters
tuner = kt.Hyperband(
    create_model,
    objective="val_loss",
    max_epochs=30,
    hyperband_iterations=2)

Reloading Tuner from ./untitled_project/tuner0.json


In [54]:
best_model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath='best_model.h5',
    monitor='val_loss',
    mode='min',
    save_best_only=True
)

In [None]:
# WARNING: THIS WILL TAKE A LONG TIME. LAST RUN WAS 44 MINUTES.
# Run the kerastuner search for best hyperparameters
tuner.search(X_train_scaled, y_train, epochs=20 ,validation_data=(X_test_scaled,y_test))

In [61]:
# Get top 3 model hyperparameters and print the values
top_hyper = tuner.get_best_hyperparameters(3)
for param in top_hyper:
    print(param.values)

{'activation': 'relu', 'first_units': 7, 'num_layers': 15, 'units_0': 5, 'units_1': 19, 'units_2': 15, 'units_3': 3, 'units_4': 11, 'units_5': 1, 'units_6': 15, 'units_7': 13, 'units_8': 7, 'units_9': 15, 'units_10': 13, 'units_11': 15, 'units_12': 9, 'units_13': 19, 'units_14': 11, 'units_15': 13, 'units_16': 11, 'units_17': 9, 'units_18': 7, 'units_19': 3, 'tuner/epochs': 30, 'tuner/initial_epoch': 0, 'tuner/bracket': 0, 'tuner/round': 0}
{'activation': 'softplus', 'first_units': 9, 'num_layers': 17, 'units_0': 5, 'units_1': 7, 'units_2': 17, 'units_3': 9, 'units_4': 1, 'units_5': 11, 'units_6': 5, 'units_7': 19, 'units_8': 1, 'units_9': 17, 'units_10': 3, 'units_11': 5, 'units_12': 13, 'units_13': 17, 'units_14': 5, 'tuner/epochs': 10, 'tuner/initial_epoch': 4, 'tuner/bracket': 3, 'tuner/round': 2, 'units_15': 1, 'units_16': 1, 'tuner/trial_id': '0034', 'units_17': 7, 'units_18': 5, 'units_19': 15}
{'activation': 'relu', 'first_units': 5, 'num_layers': 17, 'units_0': 3, 'units_1': 9

# Set up Model Using tuned parameters

In [64]:
# {'activation': 'relu', 'first_units': 7, 'num_layers': 15, 'units_0': 5, 'units_1': 19, 'units_2': 15, 'units_3': 3, 'units_4': 11, 'units_5': 1, 'units_6': 15, 'units_7': 13, 'units_8': 7, 'units_9': 15, 'units_10': 13, 'units_11': 15, 'units_12': 9, 'units_13': 19, 'units_14': 11, 'units_15': 13, 'units_16': 11, 'units_17': 9, 'units_18': 7, 'units_19': 3, 'tuner/epochs': 30, 'tuner/initial_epoch': 0, 'tuner/bracket': 0, 'tuner/round': 0}

input_dim = X_train.shape[1]

nn = tf.keras.models.Sequential()

# first hidden layer
nn.add(tf.keras.layers.Dense(units=7, activation='relu', input_dim=input_dim))

# 1-20 hidden layer
nn.add(tf.keras.layers.Dense(units=5, activation='relu'))
nn.add(tf.keras.layers.Dense(units=19, activation='relu'))
nn.add(tf.keras.layers.Dense(units=15, activation='relu'))
nn.add(tf.keras.layers.Dense(units=3, activation='relu'))
nn.add(tf.keras.layers.Dense(units=11, activation='relu'))
nn.add(tf.keras.layers.Dense(units=1, activation='relu'))
nn.add(tf.keras.layers.Dense(units=15, activation='relu'))
nn.add(tf.keras.layers.Dense(units=13, activation='relu'))
nn.add(tf.keras.layers.Dense(units=7, activation='relu'))
nn.add(tf.keras.layers.Dense(units=15, activation='relu'))
nn.add(tf.keras.layers.Dense(units=13, activation='relu'))
nn.add(tf.keras.layers.Dense(units=15, activation='relu'))
nn.add(tf.keras.layers.Dense(units=9, activation='relu'))
nn.add(tf.keras.layers.Dense(units=19, activation='relu'))
nn.add(tf.keras.layers.Dense(units=11, activation='relu'))
nn.add(tf.keras.layers.Dense(units=13, activation='relu'))
nn.add(tf.keras.layers.Dense(units=11, activation='relu'))
nn.add(tf.keras.layers.Dense(units=9, activation='relu'))
nn.add(tf.keras.layers.Dense(units=7, activation='relu'))
nn.add(tf.keras.layers.Dense(units=3, activation='relu'))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation='softplus')) # linear relu softplus tanh

nn.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_17 (Dense)            (None, 7)                 616       
                                                                 
 dense_18 (Dense)            (None, 5)                 40        
                                                                 
 dense_19 (Dense)            (None, 19)                114       
                                                                 
 dense_20 (Dense)            (None, 15)                300       
                                                                 
 dense_21 (Dense)            (None, 3)                 48        
                                                                 
 dense_22 (Dense)            (None, 11)                44        
                                                                 
 dense_23 (Dense)            (None, 1)                

In [65]:
# compile the model
nn.compile(loss='mean_absolute_percentage_error', optimizer='adam', metrics=[tf.keras.metrics.RootMeanSquaredError(), 'mae'])

In [66]:
fit_model = nn.fit(X_train_scaled, y_train, epochs = 50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [67]:
predictions = nn.predict(X_test)
predictions



array([[3.9319392e+08],
       [4.3752694e+08],
       [1.8816237e+09],
       ...,
       [3.7605011e+08],
       [4.0054739e+08],
       [5.0747232e+08]], dtype=float32)

# Set up Model (original)

In [None]:
input_dim = X_train.shape[1]

nn = tf.keras.models.Sequential()

# first hidden layer
nn.add(tf.keras.layers.Dense(units=10, activation='relu', input_dim=input_dim))

# second hidden layer
nn.add(tf.keras.layers.Dense(units=5, activation='relu'))

# third hidden layer
nn.add(tf.keras.layers.Dense(units=5, activation='softplus')) # linear relu softplus tanh

nn.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_15 (Dense)            (None, 10)                880       
                                                                 
 dense_16 (Dense)            (None, 5)                 55        
                                                                 
 dense_17 (Dense)            (None, 5)                 30        
                                                                 
Total params: 965 (3.77 KB)
Trainable params: 965 (3.77 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# compile the model
nn.compile(loss='mean_absolute_percentage_error', optimizer='adam', metrics=[tf.keras.metrics.RootMeanSquaredError(), 'mae'])

In [None]:
fit_model = nn.fit(X_train_scaled, y_train, epochs = 50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
