In [8]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [22]:
# Read CSV file
data = pd.read_csv('melb_data.csv')

# Drop useless features
data.drop(['Address', 'Lattitude', 'Longtitude'], axis=1, inplace=True)

# Convert Date to Year and Month
data['Date'] = pd.to_datetime(data['Date'], dayfirst=True)
data['Year'] = data['Date'].dt.year
data['Month'] = data['Date'].dt.month
data.drop('Date', axis=1, inplace=True)

categorical_cols = ['Suburb', 'Type', 'Method', 'SellerG', 'Regionname', 'CouncilArea', 'Postcode', 'Propertycount', 'Rooms', 'Year', 'Month', 'YearBuilt']
# Replace categorical NA with -
for col in data.columns:
    if col in categorical_cols:
        if col == 'YearBuilt':
            data[col].fillna(0, inplace=True)
        else:
            data[col].fillna('-', inplace=True)
# Replace numerical NA with mean
for col in data.columns:
    if col not in categorical_cols:
        data[col].fillna(data[col].mean(), inplace=True)

data.head()

Unnamed: 0,Suburb,Rooms,Type,Price,Method,SellerG,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Regionname,Propertycount,Year,Month
0,Abbotsford,2,h,1480000.0,S,Biggin,2.5,3067.0,2.0,1.0,1.0,202.0,151.96765,0.0,Yarra,Northern Metropolitan,4019.0,2016,12
1,Abbotsford,2,h,1035000.0,S,Biggin,2.5,3067.0,2.0,1.0,0.0,156.0,79.0,1900.0,Yarra,Northern Metropolitan,4019.0,2016,2
2,Abbotsford,3,h,1465000.0,SP,Biggin,2.5,3067.0,3.0,2.0,0.0,134.0,150.0,1900.0,Yarra,Northern Metropolitan,4019.0,2017,3
3,Abbotsford,3,h,850000.0,PI,Biggin,2.5,3067.0,3.0,2.0,1.0,94.0,151.96765,0.0,Yarra,Northern Metropolitan,4019.0,2017,3
4,Abbotsford,4,h,1600000.0,VB,Nelson,2.5,3067.0,3.0,1.0,2.0,120.0,142.0,2014.0,Yarra,Northern Metropolitan,4019.0,2016,6


In [23]:
mappings = {}
# Convert categorical features to numeric
for col in categorical_cols:
    le = LabelEncoder()
    le.fit(data[col])
    mappings[col] = dict(zip(le.classes_, le.transform(le.classes_)))
    data[col] = le.transform(data[col])
data

# Pickle mappings
import pickle
with open('mappings.pkl', 'wb') as f:
    pickle.dump(mappings, f)


In [11]:
X_scaler = StandardScaler()
X_scaled = X_scaler.fit_transform(data.drop('Price', axis=1))
X_train, X_test, y_train, y_test = train_test_split(X_scaled, data['Price'], test_size=0.2, random_state=42)
print ("X_train.shape = ", X_train.shape)
print ("y_train.shape = ", y_train.shape)
print ("X_val.shape = ", X_test.shape)
print ("y_val.shape = ", y_test.shape)

X_train.shape =  (10864, 18)
y_train.shape =  (10864,)
X_val.shape =  (2716, 18)
y_val.shape =  (2716,)


In [12]:
def custom_loss_function(y_true, y_pred):    
    y_diff = tf.abs(tf.math.log(1+y_true) - tf.math.log(1+y_pred))    
    return tf.reduce_sum(tf.square(y_diff)) / tf.cast(tf.size(y_diff), tf.float32)

In [13]:
model2 = tf.keras.Sequential([
    layers.Dense(1024, activation='relu'),#, input_shape=[len(X_train.keys())]),
    layers.Dense(512, activation='relu'),
    layers.Dense(256, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(1)
])
model2.compile(optimizer='adam', loss=custom_loss_function, metrics=['mae', 'mse'])

In [14]:
model2.fit(X_train, y_train, epochs=100, batch_size=16, verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x1e8c0e6ed60>

In [15]:
y_pred = model2.predict(X_test)
print(y_test.to_numpy(), y_pred)

[2600000.  620000. 1000000. ... 2450000. 1155000. 1040000.] [[2164712.8 ]
 [ 636383.06]
 [ 723843.  ]
 ...
 [2567189.2 ]
 [1505668.4 ]
 [ 764946.5 ]]


In [16]:
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(100,activation='relu'))
model.add(tf.keras.layers.Dense(50,activation='sigmoid'))
model.add(tf.keras.layers.Dense(50,activation='relu'))
model.add(tf.keras.layers.Dense(10,activation='relu'))
model.add(tf.keras.layers.Dense(10,activation='relu'))
model.add(tf.keras.layers.Dense(1))

In [17]:
model.compile(optimizer='adam',
              loss=custom_loss_function)
history = model.fit(X_train,y_train,
                    batch_size=32,
                    epochs=250,
                    validation_data=(X_test,y_test))

Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoch 54/250
Epoch 55/250
Epoch 56/250
Epoch 57/250
Epoch 58/250
Epoch 59/250
Epoch 60/250
Epoch 61/250
Epoch 62/250
Epoch 63/250
Epoch 64/250
Epoch 65/250
Epoch 66/250
Epoch 67/250
Epoch 68/250
Epoch 69/250
Epoch 70/250
Epoch 71/250
Epoch 72/250
Epoch 73/250
Epoch 74/250
Epoch 75/250
Epoch 76/250
Epoch 77/250
Epoch 78

In [18]:
y_pred = model.predict(X_test).flatten()
print(y_test.to_numpy(), y_pred)

# calculate MAE of the model
from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(y_test, y_pred)
print('MAE: %.3f' % mae)

[2600000.  620000. 1000000. ... 2450000. 1155000. 1040000.] [1928604.5   650264.56  961454.56 ... 2438927.8  1373157.    815144.3 ]
MAE: 189003.553


In [19]:
model.save('Model1.h5')
model2.save('Model2.h5')