# Predicting House Prices in Taiwan

In this notebook, I'm going to predict house prices in Taiwan using four methods: a deep neural network built with FastAI, two tree-based methods (XGBoost and Random Forest), and finally, good ol' linear regression.


In [None]:
#NB: Kaggle requires phone verification to use the internet or a GPU. If you haven't done that yet, the cell below will fail
#    This code is only here to check that your internet is enabled. It doesn't do anything else.
#    Here's a help thread on getting your phone number verified: https://www.kaggle.com/product-feedback/135367

import socket,warnings
# Connectivity probe: open a short-lived TCP connection to a public DNS server.
# The timeout is given to this one connection only, so the process-wide default
# socket timeout stays untouched for later downloads, and the `with` block
# guarantees the probe socket is closed again.
try:
    with socket.create_connection(('1.1.1.1', 53), timeout=1):
        pass
except socket.error as ex:
    raise Exception("STOP: No internet. Click '>|' in top right and set 'Internet' switch to on") from ex

In [None]:
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the input directory, then print them one per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Fix the random seed up front so that the split and the training run are repeatable.
import fastai.torch_core as core
core.set_seed(42, reproducible=True)

## Getting familiar with the dataset

In [None]:
!ls ../input/saudi-arabia-real-estate-aqar

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
# Turn off pandas' chained-assignment warning for the rest of the notebook.
pd.set_option("mode.chained_assignment", None)

# Read the training data and preview the first rows.
TRAIN_CSV = "/kaggle/input/real-estate-training-dataset-v2/training_data.csv"
df = pd.read_csv(TRAIN_CSV)
df.head()

In [None]:
# Integer code assigned to each county/city name in the "縣市" column.
COUNTY_CODES = {
    "台北市": 0,
    "新北市": 1,
    "桃園市": 2,
    "台中市": 3,
    "台南市": 4,
    "高雄市": 5,
    "新竹縣": 6,
    "苗栗縣": 7,
    "彰化縣": 8,
    "南投縣": 9,
    "雲林縣": 10,
    "嘉義縣": 11,
    "屏東縣": 12,
    "宜蘭縣": 13,
    "花蓮縣": 14,
    "台東縣": 15,
    "澎湖縣": 16,
    "金門縣": 17,
    "連江縣": 18,
    "基隆市": 19,
    "新竹市": 20,
    "嘉義市": 21,
}

# Label-encode the column in one pass. Mapping by value (instead of writing
# row by row through a hand-maintained counter) does not depend on the index
# labels being exactly 0..n-1. Values without a code are left unchanged.
county = df["縣市"]
df["縣市"] = county.map(lambda name: COUNTY_CODES.get(name, name))

# Print every row value that has no code, in row order, so gaps in the table are visible.
for name in county[~county.isin(list(COUNTY_CODES))]:
    print(name)

In [None]:
# Remove, in a single call, the columns that are not passed to the models below.
unused_columns = ['ID', '鄉鎮市區', '路名', '使用分區', '主要用途', '主要建材', '建物型態', '備註']
df = df.drop(columns=unused_columns)


In [None]:
# Summary statistics of the remaining columns.
df.describe()

In [None]:
# Column names that are left after the drops above.
df.columns

In [None]:
# Quick look: does the unit price move with the building area? (x-axis limited to 0-20)
df.plot.scatter(x='建物面積', y='單價', xlim=(0, 20))

## A bit of preprocessing

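I'm going to remove outliers in the price column. The original plan was to also convert the prices to log-prices, because DNNs work better with small numbers, but that transform is currently commented out in the cell below, so the models are trained on the raw prices.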


In [None]:
print("Number of Samples Before removing outliers:")
print(len(df))

# Keep only the rows whose price z-score is below 4 in absolute value.
price_z = np.abs(stats.zscore(df['單價']))
df = df[price_z < 4]

print("Number of Samples After removing outliers:")
print(len(df))


# The log transform of the target is currently disabled:
#df['單價'] = np.log(df['單價'])

### Observations

We have 24 features; some are continuous, some are categorical.

How I chose whether a feature is continuous or categorical for this dataset:
- If the feature is binary (0, 1), I made it categorical (e.g. fireplace, basement, furnished).
- If the feature is a count or a measurement (0..n), I made it continuous (e.g. size, property_age, bedrooms).

# Building the Models

## Deep Neural Network

Using FastAI, I built a neural network with two hidden layers of 1024 and 512 units. My choice was arbitrary.

In [None]:
import fastai.tabular.all as fast
# Feature roles and target column.
categorical_cols = ['縣市', '移轉層次', '總樓層數', '車位個數']
continuous_cols = ['土地面積', '屋齡', '建物面積', '車位面積', '橫坐標', '縱坐標', '主建物面積', '陽台面積', '附屬建物面積']
target_col = '單價'

# Random 80/20 train/validation split over the row positions of df.
splitter = fast.RandomSplitter(valid_pct=0.2)
splits = splitter(fast.range_of(df))

# Wrap the frame with its preprocessing steps and the regression target.
to = fast.TabularPandas(
    df,
    procs=[fast.Normalize, fast.Categorify],
    cat_names=categorical_cols,
    cont_names=continuous_cols,
    y_names=target_col,
    y_block=fast.RegressionBlock(),
    splits=splits,
)

to.xs.head(10)

In [None]:
# Build the fastai DataLoaders from the TabularPandas object, with a batch size of 32.
dls = to.dataloaders(bs=32)

# Show one batch as a quick visual check of the inputs.
dls.show_batch()

In [None]:
def rmspe(pred, targ):
    """Root mean squared percentage error on the target's original scale.

    Used in place of `fast.exp_rmspe`, which exponentiates both arguments
    first and is therefore only meaningful for a log-transformed target.
    """
    pred, targ = pred.reshape(-1), targ.reshape(-1)
    return fast.torch.sqrt((((targ - pred) / targ) ** 2).mean())


# Bound the network output to [0, 1.2 * max price]. The log transform of the
# target is commented out in the preprocessing cell, so this is a raw price.
max_y = np.max(df['單價']) * 1.2
y_range = fast.torch.tensor([0, max_y])
tc = fast.tabular_config(ps=[0.001, 0.01], embed_p=0.04, y_range=y_range)

# Metric positions matter: later cells read metrics[0], metrics[1] (R2) and metrics[2] (RMSE).
learn = fast.tabular_learner(dls, layers=[1024, 512],
                        metrics=[fast.AccumMetric(rmspe), fast.R2Score(), fast.rmse],
                        config=tc,
                        loss_func=fast.MSELossFlat())

# The metrics are reported on the same scale as the target, so no exp() is needed to read them.

learn.summary()


In [None]:
# Run the learning-rate finder and ask for four suggestions (minimum, steep, valley, slide).
lr = learn.lr_find(suggest_funcs=(fast.minimum, fast.steep, fast.valley, fast.slide))

### Fitting the model

The choice of the learning rate is somewhat arbitrary: I use the `valley` suggestion from the learning-rate finder's loss curve.

In [None]:
# Train for 10 epochs with the one-cycle schedule, passing the finder's "valley" suggestion as the learning rate.
learn.fit_one_cycle(10, lr.valley)

In [None]:
# Export the trained learner under the name 'model' so it can be reloaded in the next cell.
learn.export('model')

# This is needed to free up memory. Python seems to leave the model in memory if not explicitly deleted
del learn

In [None]:
# Reload the learner that was exported as 'model' in the previous cell.
learn = fast.load_learner('model')

In [None]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (in percent).

    Both inputs are flattened before the comparison so that a column vector
    of shape (n, 1) and a flat vector of shape (n,) are compared element by
    element instead of being broadcast into an (n, n) matrix.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values. A zero entry makes the result inf or nan.
    y_pred : array-like
        Predicted values, with the same number of elements as `y_true`.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            "y_true and y_pred must have the same number of elements, got "
            f"{y_true.size} and {y_pred.size}"
        )
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [None]:
# Sanity Check to see if our model is actually accurate
import math

# NOTE: this scores the model on the entire dataset (training rows included),
# so it is a smoke test rather than an unbiased estimate.
test_df = df

dl = learn.dls.test_dl(test_df)
raw_pred, raw_targets = learn.get_preds(dl=dl)

# The target is trained on its original scale (the log transform is commented
# out in the preprocessing cell), so the values are used as-is, without exp().
pred = raw_pred.numpy().ravel()
targets = raw_targets.numpy().ravel()

# Named `full_rmse` so it does not collide with the `rmse()` helper defined further down.
full_rmse = np.linalg.norm(pred - targets) / np.sqrt(len(targets))
print("rmse:", full_rmse)
print("actual:", targets[:10])
print("predicted:",pred[:10])

In [None]:
# The target is on its original scale (the log transform is commented out in
# the preprocessing cell), so the recorded RMSE is not exponentiated.
# NOTE(review): the recorder values presumably come from the most recent
# validation pass of this learner -- confirm which rows they refer to.
dnn_rmse = float(learn.recorder.metrics[2].value)
dnn_r2 = learn.recorder.metrics[1].value
dnn_mape = mean_absolute_percentage_error(raw_targets, raw_pred)
print(learn.recorder.metrics[0].name + ":", learn.recorder.metrics[0].value)
print("r2:", dnn_r2)
print("rmse:", dnn_rmse)
print("mape:", dnn_mape)

# Predicted vs. actual values.
plt.figure(figsize=(8,5))
plt.scatter(raw_targets, raw_pred)
plt.xlabel("target value") #x label
plt.ylabel("predictvalue") #y label
plt.show()

# Tree methods

First we'll need to obtain the train and test set from the FastAI TabularPandas object

In [None]:
# Take the features and targets of both parts of the fastai split.
train_part, valid_part = to.train, to.valid

X_train = train_part.xs
y_train = train_part.ys.values.ravel()
X_test = valid_part.xs
y_test = valid_part.ys.values.ravel()

X_train

## Random Forest

We'll be using Scikit-learn implementation of random forests

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Forest of depth-2 trees with a fixed random_state, fitted on the training split.
rf = RandomForestRegressor(max_depth=2, random_state=0)
rf.fit(X_train, y_train)

In [None]:

def rmse(y, y_pred):
    """Return the root mean squared error between targets and predictions.

    The result is on the same scale as `y`. The target column is not
    log-transformed in this notebook (that line is commented out in the
    preprocessing cell), so the error is not exponentiated.

    Parameters
    ----------
    y : array-like
        Ground-truth values.
    y_pred : array-like
        Predicted values, broadcast-compatible with `y`.
    """
    y, y_pred = np.asarray(y), np.asarray(y_pred)
    return np.sqrt(np.mean((y - y_pred) ** 2))

# Predict once on the held-out split and reuse the result for the metrics and the plot.
rf_pred = rf.predict(X_test)

rf_rmse = rmse(y_test, rf_pred)
rf_mape = mean_absolute_percentage_error(y_test, rf_pred)
rf_r2 = rf.score(X_test, y_test)

print("RMSE:", rf_rmse)
print("mape:", rf_mape)
print("R2 Score:", rf_r2)

# Predicted vs. actual values.
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(y_test, rf_pred)
ax.set_xlabel("target value")
ax.set_ylabel("predictvalue")
plt.show()

## Gradient Boosting

In [None]:
import xgboost as xgb
# Gradient-boosted trees; the hyper-parameters are collected in one place for easy tuning.
booster_params = dict(n_estimators=100, max_depth=8, learning_rate=0.1, subsample=0.5)
model = xgb.XGBRegressor(**booster_params)
xgb_model = model.fit(X_train, y_train)

In [None]:
# Predictions of the boosted model on the held-out split, reused by the evaluation cell.
xgb_pred = xgb_model.predict(X_test)

In [None]:
# Metrics of the boosted model on the held-out split.
xgb_rmse = rmse(y_test, xgb_pred)
xgb_mape = mean_absolute_percentage_error(y_test, xgb_pred)
xgb_r2 = r2_score(y_test, xgb_pred)

for label, value in (("rmse:", xgb_rmse), ("mape:", xgb_mape), ("R2 Score:", xgb_r2)):
    print(label, value)

# Predicted vs. actual values.
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(y_test, xgb_pred)
ax.set_xlabel("target value")
ax.set_ylabel("predictvalue")
plt.show()

# Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
# Ordinary least-squares baseline, fitted on the same training split as the other models.
lm = LinearRegression()
lm.fit(X_train, y_train)

In [None]:
# Predict once on the held-out split and reuse the result for the metrics and the plot.
lm_pred = lm.predict(X_test)

lm_rmse = rmse(y_test, lm_pred)
lm_mape = mean_absolute_percentage_error(y_test, lm_pred)
lm_r2 = lm.score(X_test, y_test)

print("rmse:", lm_rmse)
print("mape:", lm_mape)
print("R2 Score:", lm_r2)

# Predicted vs. actual values.
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(y_test, lm_pred)
ax.set_xlabel("target value")
ax.set_ylabel("predictvalue")
plt.show()

# Conclusions

We find that gradient boosting and DNNs yield the best results for this dataset, with gradient boosting having a slight edge in RMSE and DNNs having a better coefficient of determination.

One thing to notice: in the DNN case, we had a significantly higher RMSE on the full dataset.

Finally, while working on this project I noticed that increasing the number of parameters of the DNN (by adjusting the layers) resulted in better RMSE and R2. However, training took much longer. It seems that DNNs would scale better if we had more data and increased the number of parameters.

In [None]:
def print_scores(title, rmse_value, r2_value, mape_value):
    """Print one model's RMSE, R2 and MAPE under a '### title ###' banner."""
    print("### " + title + " ###")
    print("rmse:", rmse_value)
    print("r2:", r2_value)
    print("mape:", mape_value)


# Side-by-side recap of the four models, in the order they were trained.
print_scores("DNN", dnn_rmse, dnn_r2, dnn_mape)
print_scores("Random Forest", rf_rmse, rf_r2, rf_mape)
print_scores("Gradient Boosting", xgb_rmse, xgb_r2, xgb_mape)
print_scores("Linear Regression", lm_rmse, lm_r2, lm_mape)