In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.preprocessing import MinMaxScaler

import torch
from torch import nn
import torch.nn.functional as F
from torch.optim import Adam

%matplotlib inline

Let's check if PyTorch is using GPU

In [33]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'device: {device}')
print(torch.cuda.get_device_name(0))

device: cuda
NVIDIA GeForce RTX 3060


## Data Exploration

The CSV file contained the house prices, the type of house asset (such as Condominium, Bungalow, etc.)

In [34]:
df = pd.read_csv("data_kaggle.csv")
print(len(df))

53883


In [35]:
df.head()

Unnamed: 0,Location,Price,Rooms,Bathrooms,Car Parks,Property Type,Size,Furnishing
0,"KLCC, Kuala Lumpur","RM 1,250,000",2+1,3.0,2.0,Serviced Residence,"Built-up : 1,335 sq. ft.",Fully Furnished
1,"Damansara Heights, Kuala Lumpur","RM 6,800,000",6,7.0,,Bungalow,Land area : 6900 sq. ft.,Partly Furnished
2,"Dutamas, Kuala Lumpur","RM 1,030,000",3,4.0,2.0,Condominium (Corner),"Built-up : 1,875 sq. ft.",Partly Furnished
3,"Cheras, Kuala Lumpur",,,,,,,
4,"Bukit Jalil, Kuala Lumpur","RM 900,000",4+1,3.0,2.0,Condominium (Corner),"Built-up : 1,513 sq. ft.",Partly Furnished


## Data Cleaning (a.k.a Pre-processing)
We noticed that there are some rows are "NaN", which containing meaningless values.
Before make our hands wet, these values must be replace by mean either its mode or mean
corresponding to its location.

In [36]:
import re
import locale
locale.setlocale(locale.LC_ALL, 'en_US.utf8')

non_decimal = re.compile(r'[^\dx]+')

df['Price'] = df['Price'].apply(lambda x: non_decimal.sub('', str(x)))
df['Price'] = df['Price'].apply(lambda x: int(x) / 1000000 if x != '' else np.nan)

In [37]:
pattern = re.compile(r"[A-Za-z] : (.,?[\d.]+)[xX~]?(\d+)? sq. ft.")

def check_none(val):
    if not (val is None):
        return locale.atof(str(val))
    else:
        return 1

def format_size_col(x: str):
    m = re.search(pattern, str(x))
    if m:
        result = locale.atof(m.groups()[0]) * check_none(m.groups()[1])
        return result / 1000
    else:
        return float('nan')

df['Size'] = df['Size'].apply(lambda x: format_size_col(x))
# df['Size'] = df['Size'].apply(lambda x: np.sqrt(int(x)) if type(x) != str else float('nan'))

In [38]:
def format_room_col(x):
    return eval(str(x) if type(x) == str and x != 'Studio' else 'float("nan")')

df['Rooms'] = df['Rooms'].apply(lambda x: format_room_col(x))

In [39]:
furnishing_dict = {
    'Unfurnished': 0,
    'Partly Furnished': 1,
    'Fully Furnished': 2,
}

df['Furnishing'] = df['Furnishing'].apply(lambda x: int(furnishing_dict[x] if str(x) in furnishing_dict.keys() else 0))

In [40]:
df['Location'] = df['Location'].apply(lambda x: x.removesuffix(', Kuala Lumpur'))

### Handling missing values

When dealing with missing data, data scientists use two common methods to solve this 
problem: imputations, or just removing the row that contains missing value.

In this project, we will just drop the missing rows. 

In [41]:
df = df.dropna().reset_index(drop=True)

In [42]:
df[['Rooms', 'Bathrooms', 'Car Parks']] = df[['Rooms', 'Bathrooms', 'Car Parks']].astype('float').astype('int')
#df['Location'] = df['Location'].astype('category')

In [43]:
from sklearn.preprocessing import LabelEncoder

df['Location'] = LabelEncoder().fit_transform(df['Location'])
df

Unnamed: 0,Location,Price,Rooms,Bathrooms,Car Parks,Property Type,Size,Furnishing
0,40,1.250,3,3,2,Serviced Residence,1.335,2
1,26,1.030,3,4,2,Condominium (Corner),1.875,1
2,11,0.900,5,3,2,Condominium (Corner),1.513,1
3,76,5.350,6,5,4,Bungalow,7.200,1
4,76,2.600,5,4,4,Semi-detached House,3.600,1
...,...,...,...,...,...,...,...,...
34424,46,0.585,4,3,2,Condominium,1.313,0
34425,39,1.400,4,3,2,Condominium (Corner),1.544,2
34426,38,0.880,1,1,1,Condominium (Corner),0.650,1
34427,66,2.700,6,6,3,Condominium (Corner),3.973,1


In [44]:
df['Property Type'] = LabelEncoder().fit_transform(df['Property Type'])
df

Unnamed: 0,Location,Price,Rooms,Bathrooms,Car Parks,Property Type,Size,Furnishing
0,40,1.250,3,3,2,77,1.335,2
1,26,1.030,3,4,2,55,1.875,1
2,11,0.900,5,3,2,55,1.513,1
3,76,5.350,6,5,4,44,7.200,1
4,76,2.600,5,4,4,71,3.600,1
...,...,...,...,...,...,...,...,...
34424,46,0.585,4,3,2,54,1.313,0
34425,39,1.400,4,3,2,55,1.544,2
34426,38,0.880,1,1,1,55,0.650,1
34427,66,2.700,6,6,3,55,3.973,1


# Remove Outliers

In [45]:
df = df.drop(df[df['Price'] >= 20].index)
df

Unnamed: 0,Location,Price,Rooms,Bathrooms,Car Parks,Property Type,Size,Furnishing
0,40,1.250,3,3,2,77,1.335,2
1,26,1.030,3,4,2,55,1.875,1
2,11,0.900,5,3,2,55,1.513,1
3,76,5.350,6,5,4,44,7.200,1
4,76,2.600,5,4,4,71,3.600,1
...,...,...,...,...,...,...,...,...
34424,46,0.585,4,3,2,54,1.313,0
34425,39,1.400,4,3,2,55,1.544,2
34426,38,0.880,1,1,1,55,0.650,1
34427,66,2.700,6,6,3,55,3.973,1


In [46]:
df = df.drop(df[df['Rooms'] >= 17].index)
df = df.drop(df[df['Car Parks'] >= 10].index)
df = df.drop(df[df['Size'] >= 120].index)

In [47]:
df

Unnamed: 0,Location,Price,Rooms,Bathrooms,Car Parks,Property Type,Size,Furnishing
0,40,1.250,3,3,2,77,1.335,2
1,26,1.030,3,4,2,55,1.875,1
2,11,0.900,5,3,2,55,1.513,1
3,76,5.350,6,5,4,44,7.200,1
4,76,2.600,5,4,4,71,3.600,1
...,...,...,...,...,...,...,...,...
34424,46,0.585,4,3,2,54,1.313,0
34425,39,1.400,4,3,2,55,1.544,2
34426,38,0.880,1,1,1,55,0.650,1
34427,66,2.700,6,6,3,55,3.973,1


## Building Machine Learning Model

In [48]:
cat_feat = ["Location", "Rooms", "Bathrooms", "Car Parks", "Property Type", "Furnishing"]
cont_feat = 'Size'
out_feat = "Price"

In [49]:
train_size = int(len(df) * 0.8)
train_size

27425

In [50]:
train_set = df[:train_size]
test_set = df[train_size:]

In [51]:
X = train_set.drop(columns=['Price'])
y = train_set[out_feat]

In [52]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

In [53]:
cat_dims = [len(X[col].unique()) for col in cat_feat]
cat_dims

[80, 13, 14, 9, 87, 3]

In [54]:
embedding_dims = [(x, min(50, (x + 1) // 2)) for x in cat_dims]

In [55]:
from torch.utils.data import Dataset, DataLoader

class HousePriceDataset(Dataset):
    def __init__(self, X, Y, cat_col_names, cont_col_names, device):
        X = X.copy()
        self.X_cat = torch.tensor(X[cat_col_names].to_numpy(), dtype=torch.int64).to(device) #categorical columns
        self.X_cont = torch.tensor(X[cont_col_names].to_numpy(), dtype=torch.float32).reshape(-1, 1).to(device) #numerical columns
        self.y = torch.tensor(Y.to_numpy(), dtype=torch.float32).reshape(-1, 1).to(device)
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return self.X_cat[idx], self.X_cont[idx], self.y[idx]

In [56]:
train_ds = HousePriceDataset(X_train, y_train, cat_col_names=cat_feat, cont_col_names=cont_feat, device=device)
test_ds = HousePriceDataset(X_test, y_test, cat_col_names=cat_feat, cont_col_names=cont_feat, device=device)

In [57]:
train_dl = DataLoader(train_ds, batch_size=1200, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=1200, shuffle=True)

In [58]:
class SparseTabularNN(nn.Module):
    def __init__(self, embedding_dim, n_cont, out_sz, layers, dropout_rate=0.5) -> None:
        super().__init__()
        self.embeds = nn.ModuleList([
            nn.Embedding(inp, out) for inp, out in embedding_dim
        ])
        self.emb_drop = nn.Dropout(dropout_rate)
        self.bn_cont = nn.BatchNorm1d(n_cont)

        layer_list = []
        n_emb = sum(e.embedding_dim for e in self.embeds)
        n_in = n_emb + n_cont

        for i in layers:
            layer_list.append(nn.Linear(n_in, i))
            layer_list.append(nn.ReLU(inplace=True))
            layer_list.append(nn.BatchNorm1d(i))
            layer_list.append(nn.Dropout(dropout_rate))
            n_in = i
        
        layer_list.append(nn.Linear(layers[-1], out_sz))
        
        self.layers = nn.Sequential(*layer_list)
        
    def forward(self, x_cat, x_cont):
        embeddings = [e(torch.clamp(x_cat[:,i], 0, e.num_embeddings - 1)) for i, e in enumerate(self.embeds)]
        x = torch.cat(embeddings, 1)
        x = self.emb_drop(x)

        x2 = self.bn_cont(x_cont)
        x = torch.cat([x, x2], 1)
        x = self.layers(x)

        return x

In [59]:
model = SparseTabularNN(embedding_dims, 1, 1, [240, 70], dropout_rate=0.40).to(device)

In [60]:
loss_fn = nn.MSELoss()
optimizer = Adam(model.parameters(), lr=0.05, weight_decay=0.0001)

In [61]:
final_losses = []
torch.manual_seed(2000)

for i in range(100):
    model.train()
    total, sum_loss = 0, 0
    for x_cat, x_cont, y in train_dl:
        batch = y.shape[0]
        y_pred = model(x_cat, x_cont)
        loss = torch.sqrt(F.mse_loss(y_pred, y))
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total += batch
        sum_loss += batch*(loss.item())

    mean_loss = sum_loss / total
    final_losses.append(mean_loss)

In [62]:
final_losses

[1.3765994632580982,
 0.996559434297852,
 1.0023760051214379,
 0.9607851160800555,
 0.9352491895719995,
 0.882698237026618,
 0.8886164857130651,
 0.8637183938335047,
 0.8722916829553429,
 0.8337168747333363,
 0.8720442360384202,
 0.8598506257223235,
 0.8604935814601895,
 0.8843209192550715,
 0.8370770941350062,
 0.8622472650807015,
 0.8971980828639041,
 0.8346028812059839,
 0.884963325402252,
 0.8780670476807392,
 0.8467044165814694,
 0.8646131130707948,
 0.8568808896604623,
 0.8504644217225998,
 0.8335500215006179,
 0.8376374853776604,
 0.8175178868942296,
 0.8628017955078465,
 0.8616051740176914,
 0.856517003630114,
 0.848659524943683,
 0.8249282605450264,
 0.8524246721671946,
 0.871998388276061,
 0.8734276587569725,
 0.8440888138282484,
 0.8244805337085219,
 0.861878219146346,
 0.8583048056665071,
 0.8651441551928316,
 0.8385987411658115,
 0.8410468637236923,
 0.8593257705841482,
 0.8192773532519693,
 0.8839502694287296,
 0.8398511121788129,
 0.8271936912806119,
 0.8275588377563108,