# Imports

In [1]:
# import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA


In [2]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
os.chdir('/content/drive/MyDrive/UCF Data Analytics Course/Project 4/Property_Value_NN_Estimator')

# Read and Clean Data

In [4]:
# import data
redfin_df = pd.read_csv("cleaned_data.csv")
redfin_df.head()

Unnamed: 0,full_address,beds,baths,year_built,sq_ft_interior,sq_ft_lot,price,lat,long,zip_code
0,2326 LUAU ST MESQUITE TX 75150,3,1.5,1966,1356,8303,241183,32.798774,-96.649097,75150
1,1737 HIGHLAND ST MESQUITE TX 75149,3,1.0,1955,1454,7501,218014,32.787427,-96.609647,75149
2,2900 SIDNEY DR MESQUITE TX 75150,3,2.0,1972,1392,7196,241089,32.817247,-96.644125,75150
3,1507 RICHARD ST MESQUITE TX 75149,3,1.0,1957,1313,7501,205940,32.787413,-96.616575,75149
4,806 BRANDON DR SEAGOVILLE TX 75159,5,2.0,2004,1504,8407,275611,32.661589,-96.547004,75159


In [5]:
redfin_copy_df = redfin_df.copy()

In [6]:
# drop columns
redfin_copy_df = redfin_copy_df.drop(columns=['full_address', 'lat', 'long'])
redfin_copy_df.head()

Unnamed: 0,beds,baths,year_built,sq_ft_interior,sq_ft_lot,price,zip_code
0,3,1.5,1966,1356,8303,241183,75150
1,3,1.0,1955,1454,7501,218014,75149
2,3,2.0,1972,1392,7196,241089,75150
3,3,1.0,1957,1313,7501,205940,75149
4,5,2.0,2004,1504,8407,275611,75159


In [7]:
redfin_copy_df.nunique()

beds                 12
baths                22
year_built          124
sq_ft_interior     3944
sq_ft_lot          5680
price             31895
zip_code             82
dtype: int64

In [8]:
redfin_copy_df.dtypes

beds                int64
baths             float64
year_built          int64
sq_ft_interior      int64
sq_ft_lot           int64
price               int64
zip_code            int64
dtype: object

In [9]:
# zip needs to be treated as categorical
redfin_copy_df['zip_code'] = redfin_copy_df['zip_code'].astype(str)
redfin_copy_df.dtypes

beds                int64
baths             float64
year_built          int64
sq_ft_interior      int64
sq_ft_lot           int64
price               int64
zip_code           object
dtype: object

In [10]:
# convert categorical data
redfin_copy_df = pd.get_dummies(redfin_copy_df)
redfin_copy_df.head()

Unnamed: 0,beds,baths,year_built,sq_ft_interior,sq_ft_lot,price,zip_code_75001,zip_code_75006,zip_code_75007,zip_code_75019,...,zip_code_75240,zip_code_75241,zip_code_75243,zip_code_75244,zip_code_75246,zip_code_75248,zip_code_75249,zip_code_75252,zip_code_75253,zip_code_75254
0,3,1.5,1966,1356,8303,241183,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3,1.0,1955,1454,7501,218014,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,2.0,1972,1392,7196,241089,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,1.0,1957,1313,7501,205940,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,2.0,2004,1504,8407,275611,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Split, Scale, Compile for NN model

In [11]:
# split preprocessed data into features and targets
y = redfin_copy_df['price'].values
X = redfin_copy_df.drop(columns='price').values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [12]:
# create the scaler
scaler = StandardScaler()

# fit the scaler
X_scaler = scaler.fit(X_train)

# scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Optional PCA method

In [13]:
# # Instantiate the PCA instance with 3 PCA variables
# pca = PCA(n_components=3)

# # Fit the PCA model on the transformed credit card DataFrame
# pca_X = pca.fit_transform(df_encoded)

# # Review the first 5 rows of list data
# pca_X[:5]

# Set up HyperParameter tuning process

In [14]:
# Install keras-tuner for hyperparamter tuning later on
# Uncomment when using in Google Colab
!pip install keras-tuner

Collecting keras-tuner
  Downloading keras_tuner-1.4.6-py3-none-any.whl (128 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m128.9/128.9 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.6 kt-legacy-1.0.5


In [15]:
# Create a method that creates a new Sequential model with hyperparameter options
def create_model(hp):
    nn_model = tf.keras.models.Sequential()

    # Allow kerastuner to decide which activation function to use in hidden layers
    activation = hp.Choice('activation',['relu','tanh','linear','softplus'])
    # activation2 = hp.Choice('activation',['relu','tanh','linear','softplus'])
    activation2 = ['relu','tanh','linear','softplus']


    # Allow kerastuner to decide number of neurons in first layer
    nn_model.add(tf.keras.layers.Dense(units=hp.Int('first_units',
        min_value=1,
        max_value=10,
        step=2), activation=activation, input_dim=87))

    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
    for i in range(hp.Int('num_layers', 1, 20)):
        nn_model.add(tf.keras.layers.Dense(units=hp.Int('units_' + str(i),
            min_value=1,
            max_value=20,
            step=2),
            activation=activation))

    nn_model.add(tf.keras.layers.Dense(units=1, activation=activation2[3]))

    # Compile the model
    nn_model.compile(loss="mean_absolute_percentage_error", optimizer='adam', metrics=[tf.keras.metrics.RootMeanSquaredError(), 'mae'])

    return nn_model

In [16]:
# Import the kerastuner library
import keras_tuner as kt

# Create tuner and establish parameters
tuner = kt.Hyperband(
    create_model,
    objective="val_loss",
    max_epochs=30,
    hyperband_iterations=2)

In [17]:
best_model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath='best_model.h5',
    monitor='val_loss',
    mode='min',
    save_best_only=True
)

In [18]:
# WARNING: THIS WILL TAKE A LONG TIME. LAST RUN WAS 1H 46 MINUTES.
# Run the kerastuner search for best hyperparameters
tuner.search(X_train_scaled, y_train, epochs=20 ,validation_data=(X_test_scaled,y_test))

Trial 180 Complete [00h 01m 24s]
val_loss: 27.165138244628906

Best val_loss So Far: 9.161386489868164
Total elapsed time: 01h 46m 29s


In [19]:
# Get top 3 model hyperparameters and print the values
top_hyper = tuner.get_best_hyperparameters(3)
for param in top_hyper:
    print(param.values)

{'activation': 'softplus', 'first_units': 5, 'num_layers': 20, 'units_0': 15, 'units_1': 11, 'units_2': 13, 'units_3': 17, 'units_4': 5, 'units_5': 1, 'units_6': 17, 'units_7': 15, 'units_8': 9, 'units_9': 19, 'units_10': 1, 'units_11': 1, 'units_12': 3, 'units_13': 7, 'units_14': 19, 'units_15': 5, 'units_16': 17, 'units_17': 7, 'units_18': 17, 'units_19': 11, 'tuner/epochs': 30, 'tuner/initial_epoch': 0, 'tuner/bracket': 0, 'tuner/round': 0}
{'activation': 'softplus', 'first_units': 5, 'num_layers': 20, 'units_0': 15, 'units_1': 13, 'units_2': 5, 'units_3': 9, 'units_4': 17, 'units_5': 11, 'units_6': 7, 'units_7': 1, 'units_8': 13, 'units_9': 1, 'units_10': 1, 'units_11': 11, 'units_12': 7, 'units_13': 9, 'units_14': 17, 'units_15': 19, 'units_16': 3, 'units_17': 15, 'units_18': 19, 'units_19': 11, 'tuner/epochs': 30, 'tuner/initial_epoch': 10, 'tuner/bracket': 3, 'tuner/round': 3, 'tuner/trial_id': '0047'}
{'activation': 'softplus', 'first_units': 9, 'num_layers': 19, 'units_0': 1, 

# Set up Model Using tuned parameters

In [27]:
# {'activation': 'relu', 'first_units': 7, 'num_layers': 15, 'units_0': 5, 'units_1': 19, 'units_2': 15, 'units_3': 3, 'units_4': 11, 'units_5': 1, 'units_6': 15, 'units_7': 13, 'units_8': 7, 'units_9': 15, 'units_10': 13, 'units_11': 15, 'units_12': 9, 'units_13': 19, 'units_14': 11, 'units_15': 13, 'units_16': 11, 'units_17': 9, 'units_18': 7, 'units_19': 3, 'tuner/epochs': 30, 'tuner/initial_epoch': 0, 'tuner/bracket': 0, 'tuner/round': 0}

# {'activation': 'softplus',
# 'first_units': 5,
# 'num_layers': 20,
# 'units_0': 15,
# 'units_1': 11,
# 'units_2': 13,
# 'units_3': 17,
# 'units_4': 5,
# 'units_5': 1,
# 'units_6': 17,
# 'units_7': 15,
# 'units_8': 9,
# 'units_9': 19,
# 'units_10': 1,
# 'units_11': 1,
# 'units_12': 3,
# 'units_13': 7,
# 'units_14': 19,
# 'units_15': 5,
# 'units_16': 17,
# 'units_17': 7,
# 'units_18': 17,
# 'units_19': 11,
# 'tuner/epochs': 30, 'tuner/initial_epoch': 0, 'tuner/bracket': 0, 'tuner/round': 0}

input_dim = X_train.shape[1]

nn = tf.keras.models.Sequential()

# first hidden layer
nn.add(tf.keras.layers.Dense(units=15, activation='softplus', input_dim=input_dim))

# 1-20 hidden layer
nn.add(tf.keras.layers.Dense(units=11, activation='softplus'))
nn.add(tf.keras.layers.Dense(units=13, activation='softplus'))
nn.add(tf.keras.layers.Dense(units=17, activation='softplus'))
nn.add(tf.keras.layers.Dense(units=5, activation='softplus'))
nn.add(tf.keras.layers.Dense(units=1, activation='softplus'))
nn.add(tf.keras.layers.Dense(units=17, activation='softplus'))
nn.add(tf.keras.layers.Dense(units=15, activation='softplus'))
nn.add(tf.keras.layers.Dense(units=9, activation='softplus'))
nn.add(tf.keras.layers.Dense(units=19, activation='softplus'))
nn.add(tf.keras.layers.Dense(units=1, activation='softplus'))
nn.add(tf.keras.layers.Dense(units=1, activation='softplus'))
nn.add(tf.keras.layers.Dense(units=3, activation='softplus'))
nn.add(tf.keras.layers.Dense(units=7, activation='softplus'))
nn.add(tf.keras.layers.Dense(units=19, activation='softplus'))
nn.add(tf.keras.layers.Dense(units=5, activation='softplus'))
nn.add(tf.keras.layers.Dense(units=17, activation='softplus'))
nn.add(tf.keras.layers.Dense(units=7, activation='softplus'))
nn.add(tf.keras.layers.Dense(units=17, activation='softplus'))

# Output layer
nn.add(tf.keras.layers.Dense(units=11, activation='softplus')) # linear relu softplus tanh

nn.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_28 (Dense)            (None, 15)                1320      
                                                                 
 dense_29 (Dense)            (None, 11)                176       
                                                                 
 dense_30 (Dense)            (None, 13)                156       
                                                                 
 dense_31 (Dense)            (None, 17)                238       
                                                                 
 dense_32 (Dense)            (None, 5)                 90        
                                                                 
 dense_33 (Dense)            (None, 1)                 6         
                                                                 
 dense_34 (Dense)            (None, 17)               

In [28]:
# compile the model
nn.compile(loss='mean_absolute_percentage_error', optimizer='adam', metrics=[tf.keras.metrics.RootMeanSquaredError(), 'mae'])

In [29]:
fit_model = nn.fit(X_train_scaled, y_train, epochs = 100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [30]:
predictions = nn.predict(X_test)
predictions



array([[ 134324.12,  134351.39,  134352.25, ...,  134347.61,  134349.8 ,
         134340.81],
       [ 134324.12,  134351.39,  134352.25, ...,  134347.61,  134349.8 ,
         134340.81],
       [ 134324.12,  134351.39,  134352.25, ...,  134347.61,  134349.8 ,
         134340.81],
       ...,
       [ 134324.12,  134351.39,  134352.25, ...,  134347.61,  134349.8 ,
         134340.81],
       [ 134324.11,  134351.4 ,  134352.25, ...,  134347.61,  134349.8 ,
         134340.83],
       [2717436.  , 2717991.5 , 2717992.5 , ..., 2717917.2 , 2717946.  ,
        2717784.  ]], dtype=float32)

# Set up Model (original)

In [24]:
input_dim = X_train.shape[1]

nn = tf.keras.models.Sequential()

# first hidden layer
nn.add(tf.keras.layers.Dense(units=10, activation='relu', input_dim=input_dim))

# second hidden layer
nn.add(tf.keras.layers.Dense(units=5, activation='relu'))

# third hidden layer
nn.add(tf.keras.layers.Dense(units=5, activation='softplus')) # linear relu softplus tanh

nn.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_25 (Dense)            (None, 10)                880       
                                                                 
 dense_26 (Dense)            (None, 5)                 55        
                                                                 
 dense_27 (Dense)            (None, 5)                 30        
                                                                 
Total params: 965 (3.77 KB)
Trainable params: 965 (3.77 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [25]:
# compile the model
nn.compile(loss='mean_absolute_percentage_error', optimizer='adam', metrics=[tf.keras.metrics.RootMeanSquaredError(), 'mae'])

In [26]:
fit_model = nn.fit(X_train_scaled, y_train, epochs = 50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
