# Building a Tensorflow predictive model based on housing price data for Shanghai

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
import warnings
%matplotlib inline

warnings.filterwarnings("ignore", category=DeprecationWarning) 
tf.logging.set_verbosity(tf.logging.ERROR)

  from ._conv import register_converters as _register_converters


In [8]:
path = '../data/price_data.csv'

raw_data = pd.read_csv(path)
raw_data = raw_data.dropna(axis = 0, how = 'any')
raw_data.room = raw_data.room.astype('int64')
raw_data.renovation = raw_data.renovation.astype('int64')
raw_data.lvgroom = raw_data.lvgroom.astype('int64')
raw_data.sqm = raw_data.sqm.astype('float64')
# raw_data.floor = raw_data.floor.astype('category')

In [13]:
raw_data.room

2        1
4        3
5        1
6        1
7        2
9        1
10       1
11       2
12       2
14       2
16       3
21       3
22       4
23       3
24       1
25       2
29       1
30       2
33       2
35       1
36       1
37       2
39       3
41       2
43       1
44       2
46       3
47       2
48       2
49       2
        ..
13652    4
13654    3
13656    3
13657    2
13658    1
13661    3
13664    2
13665    2
13666    3
13667    2
13668    1
13670    2
13673    2
13674    1
13676    3
13678    2
13679    3
13680    3
13683    5
13684    5
13686    4
13698    2
13703    4
13704    4
13705    4
13706    3
13708    3
13710    2
13711    3
13712    2
Name: room, dtype: int64

### Getting statistical values for each variable

In [9]:
raw_data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Unnamed: 0,7642.0,7214.722717,4382.403296,2.0,3520.5,6841.5,11184.5,15184.0
sqm,7642.0,107.751112,61.923839,8.0,64.0,97.0,135.0,731.0
room,7642.0,2.246009,0.934408,1.0,2.0,2.0,3.0,11.0
lvgroom,7642.0,1.593562,0.616656,0.0,1.0,2.0,2.0,4.0
year,7642.0,2000.418477,9.86445,1918.0,1996.0,2003.0,2006.0,2017.0
renovation,7642.0,2.767208,0.708619,1.0,2.0,3.0,3.0,4.0
price,7642.0,12337.277807,11213.236025,1000.0,5500.0,9000.0,15000.0,200000.0
lat,7642.0,31.208627,0.086148,30.707683,31.194303,31.221302,31.241236,31.407696
lng,7642.0,121.481631,0.073207,121.103223,121.434194,121.492974,121.534738,121.71109
postal,7642.0,200495.652185,666.868666,200000.0,200085.0,200125.0,201112.0,215332.0


In [10]:
from sklearn.model_selection import train_test_split as tts

x_data = raw_data.drop(['price'],axis=1)
y_data = raw_data['price']

x_train, x_test, y_train, y_test = tts(x_data, y_data, test_size=0.3, random_state=101)

In [20]:
from sklearn.preprocessing import MinMaxScaler

warnings.filterwarnings("ignore", category=DeprecationWarning) 

scaler = MinMaxScaler()

numeric_columns = ['sqm', 'year', 'lat', 'lng', 'dist_1',]

for item in numeric_columns:
    print(item)
    scaler.fit(x_train[item].reshape(-1, 1))
    scaler.fit(x_train[item].reshape(-1, 1))
    x_train[item] = scaler.transform(x_train[item].reshape(-1, 1))
    x_test[item] = scaler.transform(x_test[item].reshape(-1, 1))

sqm
year
lat


  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == '':
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


lng
dist_1


## Features

Starting off with seven features, later adding metro stations as one

In [21]:
#tf.feature_column.embedding_column('room', dimension = len(raw_data))

f1 = tf.feature_column.numeric_column('sqm')
f2 = tf.feature_column.numeric_column('year')
f3 = tf.feature_column.numeric_column('dist_1')
f4 = tf.feature_column.categorical_column_with_hash_bucket("room", hash_bucket_size=300)
f4 = tf.feature_column.categorical_column_with_hash_bucket(key = 'room', hash_bucket_size = 300, dtype = tf.int64)
f5 = tf.feature_column.categorical_column_with_hash_bucket(key = 'renovation', hash_bucket_size = 100, dtype = tf.int64)
f6 = tf.feature_column.categorical_column_with_hash_bucket(key = 'lvgroom', hash_bucket_size = 100, dtype = tf.int64)
f7 = tf.feature_column.categorical_column_with_hash_bucket(key = 'floor', hash_bucket_size = 100, dtype = tf.string)

numeric_columns = [f1, f2, f3]

embedded_columns = [tf.feature_column.embedding_column(f4, 1),
           tf.feature_column.embedding_column(f5, 1),
           tf.feature_column.embedding_column(f6, 1),
           tf.feature_column.embedding_column(f7, 1)]

feature_columns = numeric_columns + embedded_columns

### Defining model functions

In [22]:
def input_func(batch_size, num_epochs):
    return tf.estimator.inputs.pandas_input_fn(
    x = x_train,
    y = y_train,
    batch_size = batch_size,
    num_epochs = num_epochs,
    shuffle = True
    )

In [23]:
def DNN_model(hidden_units, activation_fn, feature_columns):

    return tf.estimator.DNNRegressor(
    hidden_units = hidden_units,
    feature_columns = feature_columns,
    activation_fn = activation_fn,
    )

In [24]:
predict_func = tf.estimator.inputs.pandas_input_fn(
        x = x_test,
        batch_size = 10,
        num_epochs = 1,
        shuffle = False
    )

In [25]:
def rmse(predictions):
    
    final_preds = []
    for pred in predictions:
        final_preds.append(pred['predictions'])
    
    return mean_squared_error(y_test,final_preds)**0.5

## Comparing different activation functions

### CReLU

In [26]:
model_A1 = DNN_model([7,14,28,14,7], tf.nn.crelu, feature_columns)

model_A1.train(
    input_fn = input_func(100,3000), 
    steps = 5000,
    )

pred_gen = model_A1.predict(predict_func)
predictions_A1 = list(pred_gen)
rmse(predictions_A1)

6153.853786820655

### Softplus

In [27]:
model_A2 = DNN_model([7,14,28,14,7], tf.nn.softplus, feature_columns)

model_A2.train(
    input_fn = input_func(100,3000), 
    steps = 5000,
    )

pred_gen = model_A2.predict(predict_func)
predictions_A2 = list(pred_gen)
rmse(predictions_A2)

6286.734806126869

### ELu

In [28]:
model_A3 = DNN_model([7,14,28,14,7], tf.nn.elu, feature_columns)

model_A3.train(
    input_fn = input_func(100,3000), 
    steps = 5000,
    )

pred_gen = model_A3.predict(predict_func)
predictions_A3 = list(pred_gen)
rmse(predictions_A3)

6003.042615422382

### ReLU6

In [29]:
model_A4 = DNN_model([7,14,28,14,7], tf.nn.relu6, feature_columns)

model_A4.train(
    input_fn = input_func(100,3000), 
    steps = 5000,
    )

pred_gen = model_A4.predict(predict_func)
predictions_A4 = list(pred_gen)
rmse(predictions_A4)

16335.916468334073

### ReLU

In [30]:
model_A5 = DNN_model([7,14,28,14,7], tf.nn.relu, feature_columns)

model_A5.train(
    input_fn = input_func(100,3000), 
    steps = 5000,
    )

pred_gen = model_A5.predict(predict_func)
predictions_A5 = list(pred_gen)
rmse(predictions_A5)

5975.391525640065

### Adding Metro Station as a feature

In [31]:
f8_bucket_size = raw_data.station_1.nunique()

f8 = tf.feature_column.categorical_column_with_hash_bucket(key = 'station_1', hash_bucket_size = f8_bucket_size, dtype = tf.string)

feature_columns_v2 = numeric_columns + embedded_columns + [tf.feature_column.embedding_column(f8, 1)]

### Hidden layers [7,14,28,14,7]

In [32]:
model_B1 = DNN_model([7,14,28,14,7], tf.nn.crelu, feature_columns_v2)

model_B1.train(
    input_fn = input_func(100,3000), 
    steps = 5000,
    )

pred_gen = model_B1.predict(predict_func)
predictions_B1 = list(pred_gen)
rmse(predictions_B1)

5906.972291144746

### Hidden layers = [8,16,32,16,8]

In [33]:
model_B2 = DNN_model([8,16,32,16,8], tf.nn.crelu, feature_columns_v2)

model_B2.train(
    input_fn = input_func(100,3000), 
    steps = 5000,
    )

pred_gen = model_B2.predict(predict_func)
predictions_B2 = list(pred_gen)
rmse(predictions_B2)

5908.291311955742

In [34]:
model_B3 = DNN_model([6,12,24,12,6], tf.nn.crelu, feature_columns_v2)

model_B3.train(
    input_fn = input_func(100,3000), 
    steps = 5000,
    )

pred_gen = model_B3.predict(predict_func)
predictions_B3 = list(pred_gen)
rmse(predictions_B3)

6142.231607644819

In [35]:
model_B4 = DNN_model([7,12,28,12,7], tf.nn.crelu, feature_columns_v2)

model_B4.train(
    input_fn = input_func(100,3000), 
    steps = 5000,
    )

pred_gen = model_B4.predict(predict_func)
predictions_B4 = list(pred_gen)
rmse(predictions_B4)

5939.421423702152

## Feature Engineering: LatLng vs. Metro Station
Which of these features is more predictive of the price? For this, I have created a crossed LatLng feature column. In order to do so, first the LatLng data was binned to 100 bins each.

In [36]:
lat_bins = pd.cut(raw_data['lat'], 100, retbins = True, right = True, include_lowest = True)[1]
lng_bins = pd.cut(raw_data['lng'], 100, retbins = True)[1]

f9 = tf.feature_column.numeric_column('lat')
f10 = tf.feature_column.numeric_column('lng')
bucketized_lat = tf.feature_column.bucketized_column(f9, list(lat_bins))
bucketized_lng = tf.feature_column.bucketized_column(f10, list(lng_bins))

crossed_lat_lng = tf.feature_column.crossed_column([bucketized_lat, bucketized_lng], 100*100)



In [37]:
feature_columns_v3 = feature_columns + [tf.feature_column.embedding_column(crossed_lat_lng,1)]

model_C1 = DNN_model([7,14,28,14,7], tf.nn.crelu, feature_columns_v3)

model_C1.train(
    input_fn = input_func(100,3000), 
    steps = 5000,
    )

pred_gen = model_C1.predict(predict_func)
predictions_C1 = list(pred_gen)
rmse(predictions_C1)

5890.563723989405

In [38]:
model_C2 = DNN_model([5,12,32,12,8], tf.nn.crelu, feature_columns_v3)

model_C2.train(
    input_fn = input_func(100,3000), 
    steps = 5000,
    )

pred_gen = model_C2.predict(predict_func)
predictions_C2 = list(pred_gen)
rmse(predictions_C2)

6077.228075326745

In [39]:
model_C3 = DNN_model([5,12,32,12,5], tf.nn.crelu, feature_columns_v3)

model_C3.train(
    input_fn = input_func(140,3000), 
    steps = 5000,
    )

pred_gen = model_C3.predict(predict_func)
predictions_C3 = list(pred_gen)
rmse(predictions_C3)

5991.415947429938

In [40]:
model_C4 = DNN_model([5,12,25,12,5], tf.nn.crelu, feature_columns_v3)

model_C4.train(
    input_fn = input_func(140,3000), 
    steps = 5000,
    )

pred_gen = model_C4.predict(predict_func)
predictions_C4 = list(pred_gen)
rmse(predictions_C4)

6007.096085035796