In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.preprocessing import MinMaxScaler

import torch
from torch import nn
import torch.nn.functional as F
from torch.optim import Adam

%matplotlib inline

Let's check if PyTorch is using GPU

In [32]:
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
print(f'device: {device}')
print(torch.cuda.get_device_name(0))

device: cuda:0
NVIDIA GeForce RTX 3060


## Data Exploration

The CSV file contained the house prices, the type of house asset (such as Condominium, Bungalow, etc.)

In [33]:
df = pd.read_csv("data_kaggle.csv")
print(len(df))

53883


In [34]:
df.head()

Unnamed: 0,Location,Price,Rooms,Bathrooms,Car Parks,Property Type,Size,Furnishing
0,"KLCC, Kuala Lumpur","RM 1,250,000",2+1,3.0,2.0,Serviced Residence,"Built-up : 1,335 sq. ft.",Fully Furnished
1,"Damansara Heights, Kuala Lumpur","RM 6,800,000",6,7.0,,Bungalow,Land area : 6900 sq. ft.,Partly Furnished
2,"Dutamas, Kuala Lumpur","RM 1,030,000",3,4.0,2.0,Condominium (Corner),"Built-up : 1,875 sq. ft.",Partly Furnished
3,"Cheras, Kuala Lumpur",,,,,,,
4,"Bukit Jalil, Kuala Lumpur","RM 900,000",4+1,3.0,2.0,Condominium (Corner),"Built-up : 1,513 sq. ft.",Partly Furnished


## Data Cleaning (a.k.a Pre-processing)
We noticed that there are some rows are "NaN", which containing meaningless values.
Before make our hands wet, these values must be replace by mean either its mode or mean
corresponding to its location.

In [35]:
import re
import locale
locale.setlocale(locale.LC_ALL, 'en_US.utf8')

non_decimal = re.compile(r'[^\dx]+')

df['Price'] = df['Price'].apply(lambda x: non_decimal.sub('', str(x)))
df['Price'] = df['Price'].apply(lambda x: int(x) / 1000000 if x != '' else np.nan)

In [36]:
pattern = re.compile(r"[A-Za-z] : (.,?[\d.]+)[xX~]?(\d+)? sq. ft.")

def check_none(val):
    if not (val is None):
        return locale.atof(str(val))
    else:
        return 1

def format_size_col(x: str):
    m = re.search(pattern, str(x))
    if m:
        result = locale.atof(m.groups()[0]) * check_none(m.groups()[1])
        return result / 1000
    else:
        return float('nan')

df['Size'] = df['Size'].apply(lambda x: format_size_col(x))
# df['Size'] = df['Size'].apply(lambda x: np.sqrt(int(x)) if type(x) != str else float('nan'))

In [37]:
def format_room_col(x):
    return eval(str(x) if type(x) == str and x != 'Studio' else 'float("nan")')

df['Rooms'] = df['Rooms'].apply(lambda x: format_room_col(x))

In [38]:
furnishing_dict = {
    'Unfurnished': 0,
    'Partly Furnished': 1,
    'Fully Furnished': 2,
}

df['Furnishing'] = df['Furnishing'].apply(lambda x: int(furnishing_dict[x] if str(x) in furnishing_dict.keys() else 0))

In [39]:
df['Location'] = df['Location'].apply(lambda x: x.removesuffix(', Kuala Lumpur'))

In [40]:
df.head()

Unnamed: 0,Location,Price,Rooms,Bathrooms,Car Parks,Property Type,Size,Furnishing
0,KLCC,1.25,3.0,3.0,2.0,Serviced Residence,1.335,2
1,Damansara Heights,6.8,6.0,7.0,,Bungalow,6.9,1
2,Dutamas,1.03,3.0,4.0,2.0,Condominium (Corner),1.875,1
3,Cheras,,,,,,,0
4,Bukit Jalil,0.9,5.0,3.0,2.0,Condominium (Corner),1.513,1


### Handling missing values

When dealing with missing data, data scientists use two common methods to solve this 
problem: imputations, or just removing the row that contains missing value.

In this project, we will just drop the missing rows. 

In [41]:
df = df.dropna().reset_index(drop=True)

In [42]:
df

Unnamed: 0,Location,Price,Rooms,Bathrooms,Car Parks,Property Type,Size,Furnishing
0,KLCC,1.250,3.0,3.0,2.0,Serviced Residence,1.335,2
1,Dutamas,1.030,3.0,4.0,2.0,Condominium (Corner),1.875,1
2,Bukit Jalil,0.900,5.0,3.0,2.0,Condominium (Corner),1.513,1
3,Taman Tun Dr Ismail,5.350,6.0,5.0,4.0,Bungalow,7.200,1
4,Taman Tun Dr Ismail,2.600,5.0,4.0,4.0,Semi-detached House,3.600,1
...,...,...,...,...,...,...,...,...
34424,Kuchai Lama,0.585,4.0,3.0,2.0,Condominium,1.313,0
34425,KL Sentral,1.400,4.0,3.0,2.0,Condominium (Corner),1.544,2
34426,KL Eco City,0.880,1.0,1.0,1.0,Condominium (Corner),0.650,1
34427,Sri Hartamas,2.700,6.0,6.0,3.0,Condominium (Corner),3.973,1


In [43]:
df[['Rooms', 'Bathrooms', 'Car Parks']] = df[['Rooms', 'Bathrooms', 'Car Parks']].astype('float').astype('int')
#df['Location'] = df['Location'].astype('category')

In [44]:
df.head()

Unnamed: 0,Location,Price,Rooms,Bathrooms,Car Parks,Property Type,Size,Furnishing
0,KLCC,1.25,3,3,2,Serviced Residence,1.335,2
1,Dutamas,1.03,3,4,2,Condominium (Corner),1.875,1
2,Bukit Jalil,0.9,5,3,2,Condominium (Corner),1.513,1
3,Taman Tun Dr Ismail,5.35,6,5,4,Bungalow,7.2,1
4,Taman Tun Dr Ismail,2.6,5,4,4,Semi-detached House,3.6,1


In [45]:
from sklearn.preprocessing import LabelEncoder

df['Location'] = LabelEncoder().fit_transform(df['Location'])
df

Unnamed: 0,Location,Price,Rooms,Bathrooms,Car Parks,Property Type,Size,Furnishing
0,40,1.250,3,3,2,Serviced Residence,1.335,2
1,26,1.030,3,4,2,Condominium (Corner),1.875,1
2,11,0.900,5,3,2,Condominium (Corner),1.513,1
3,76,5.350,6,5,4,Bungalow,7.200,1
4,76,2.600,5,4,4,Semi-detached House,3.600,1
...,...,...,...,...,...,...,...,...
34424,46,0.585,4,3,2,Condominium,1.313,0
34425,39,1.400,4,3,2,Condominium (Corner),1.544,2
34426,38,0.880,1,1,1,Condominium (Corner),0.650,1
34427,66,2.700,6,6,3,Condominium (Corner),3.973,1


## Building Machine Learning Model

In [46]:
cat_feat = ["Location", "Rooms", "Bathrooms", "Car Parks", "Furnishing"]
out_feat = "Price"

In [47]:
cat = torch.tensor(df[cat_feat].to_numpy(), dtype=torch.int64)
cat

tensor([[40,  3,  3,  2,  2],
        [26,  3,  4,  2,  1],
        [11,  5,  3,  2,  1],
        ...,
        [38,  1,  1,  1,  1],
        [66,  6,  6,  3,  1],
        [63,  4,  2,  3,  1]])

In [48]:
df[cat_feat].to_numpy()

array([[40,  3,  3,  2,  2],
       [26,  3,  4,  2,  1],
       [11,  5,  3,  2,  1],
       ...,
       [38,  1,  1,  1,  1],
       [66,  6,  6,  3,  1],
       [63,  4,  2,  3,  1]])

In [49]:
# continuous features
cont_feat = "Size"
cont = torch.tensor(df[cont_feat].to_numpy(), dtype=torch.float32).reshape(-1, 1)
cont

tensor([[1.3350],
        [1.8750],
        [1.5130],
        ...,
        [0.6500],
        [3.9730],
        [1.3130]])

In [50]:
y = torch.tensor(df["Price"].to_numpy(), dtype=torch.float32).reshape(-1, 1)
y

tensor([[1.2500],
        [1.0300],
        [0.9000],
        ...,
        [0.8800],
        [2.7000],
        [0.5400]])

In [51]:
cat_dims = [len(df[col].unique()) for col in cat_feat]
cat_dims

[82, 17, 17, 21, 3]

In [52]:
embedding_dims = [(x, min(50, (x + 1) // 2)) for x in cat_dims]
embedding_dims

[(82, 41), (17, 9), (17, 9), (21, 11), (3, 2)]

In [53]:
class SparseTabularNN(nn.Module):
    def __init__(self, embedding_dim, n_cont, out_sz, layers, dropout_rate=0.5) -> None:
        super().__init__()
        self.embeds = nn.ModuleList([
            nn.Embedding(inp, out) for inp, out in embedding_dim
        ])
        self.emb_drop = nn.Dropout(dropout_rate)
        self.bn_cont = nn.BatchNorm1d(n_cont)

        layer_list = []
        n_emb = sum((out for _, out in embedding_dim))
        n_in = n_emb + n_cont

        for i in layers:
            layer_list.append(nn.Linear(n_in, i))
            layer_list.append(nn.ReLU(inplace=True))
            layer_list.append(nn.BatchNorm1d(i))
            layer_list.append(nn.Dropout(dropout_rate))
            n_in = i
        
        layer_list.append(nn.Linear(layers[-1], out_sz))
        
        self.layers = nn.Sequential(*layer_list)
        
    def forward(self, x_cat, x_cont):
        embeddings = [e(x_cat[:,i]) for i, e in enumerate(self.embeds)]
        x = torch.cat(embeddings, 1)
        x = self.emb_drop(x)

        x2 = self.bn_cont(x_cont)
        x = torch.cat([x, x2], 1)
        x = self.layers(x)

        return x

In [54]:
model = SparseTabularNN(embedding_dims, 1, 1, [100, 50], dropout_rate=0.40)

In [55]:
model.eval()

SparseTabularNN(
  (embeds): ModuleList(
    (0): Embedding(82, 41)
    (1-2): 2 x Embedding(17, 9)
    (3): Embedding(21, 11)
    (4): Embedding(3, 2)
  )
  (emb_drop): Dropout(p=0.4, inplace=False)
  (bn_cont): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Linear(in_features=73, out_features=100, bias=True)
    (1): ReLU(inplace=True)
    (2): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.4, inplace=False)
    (4): Linear(in_features=100, out_features=50, bias=True)
    (5): ReLU(inplace=True)
    (6): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.4, inplace=False)
    (8): Linear(in_features=50, out_features=1, bias=True)
  )
)

In [56]:
loss_fn = nn.MSELoss()
optimizer = Adam(model.parameters(), lr=0.01)

In [57]:
batch_size=1200
test_size=int(batch_size*0.15)
train_categorical=cat[:batch_size-test_size]
test_categorical=cat[batch_size-test_size:batch_size]
train_cont=cont[:batch_size-test_size]
test_cont=cont[batch_size-test_size:batch_size]
y_train=y[:batch_size-test_size]
y_test=y[batch_size-test_size:batch_size]

In [58]:
len(train_categorical),len(test_categorical),len(train_cont),len(test_cont),len(y_train),len(y_test)

(1020, 180, 1020, 180, 1020, 180)

In [59]:
sum((out for _, out in embedding_dims)) + train_cont

tensor([[73.3350],
        [73.8750],
        [73.5130],
        ...,
        [73.1310],
        [72.6100],
        [73.3130]])

In [60]:
epochs=5000
final_losses=[]
for i in range(epochs):
    i=i+1
    y_pred=model(train_categorical,train_cont)
    loss=torch.sqrt(loss_fn(y_pred,y_train)) ### RMSE
    final_losses.append(loss)
    if i%10==1:
        print("Epoch number: {} and the loss : {}".format(i,loss.item()))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

Epoch number: 1 and the loss : 3.2336528301239014
Epoch number: 11 and the loss : 1.1240968704223633
Epoch number: 21 and the loss : 0.8979969024658203
Epoch number: 31 and the loss : 0.802376389503479
Epoch number: 41 and the loss : 0.715266227722168
Epoch number: 51 and the loss : 0.6253294944763184
Epoch number: 61 and the loss : 0.5633143186569214
Epoch number: 71 and the loss : 0.5337095260620117
Epoch number: 81 and the loss : 0.4782934784889221
Epoch number: 91 and the loss : 0.44469568133354187
Epoch number: 101 and the loss : 0.4299977421760559
Epoch number: 111 and the loss : 0.4105254113674164
Epoch number: 121 and the loss : 0.42368102073669434


Epoch number: 131 and the loss : 0.37161555886268616
Epoch number: 141 and the loss : 0.3570900857448578
Epoch number: 151 and the loss : 0.3483647406101227
Epoch number: 161 and the loss : 0.37417981028556824
Epoch number: 171 and the loss : 0.33906158804893494
Epoch number: 181 and the loss : 0.3319062888622284
Epoch number: 191 and the loss : 0.34317922592163086
Epoch number: 201 and the loss : 0.35731634497642517
Epoch number: 211 and the loss : 0.34852805733680725
Epoch number: 221 and the loss : 0.34493327140808105
Epoch number: 231 and the loss : 0.34591561555862427
Epoch number: 241 and the loss : 0.35145124793052673
Epoch number: 251 and the loss : 0.34269195795059204
Epoch number: 261 and the loss : 0.3443027138710022
Epoch number: 271 and the loss : 0.3368266820907593
Epoch number: 281 and the loss : 0.3120606243610382
Epoch number: 291 and the loss : 0.305613249540329
Epoch number: 301 and the loss : 0.30602335929870605
Epoch number: 311 and the loss : 0.31060031056404114
E