# Commands in Colab

In [None]:
!cp /content/drive/MyDrive/Colab\ Notebooks/build_model/TrainValidate.py /content
!cp /content/drive/MyDrive/Colab\ Notebooks/datasets/final_features_removed.csv /content
!cp /content/drive/MyDrive/Colab\ Notebooks/datasets/final_labels_removed.csv /content
!cp /content/drive/MyDrive/Colab\ Notebooks/datasets/final_sources_removed.csv /content
!mkdir /content/models

In [21]:
!pip install --quiet torchinfo
!pip install --quiet torch_snippets

# Import packages

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns

import torch
from torch import nn
from torch import optim
from torch.utils.data import DataLoader, Dataset, WeightedRandomSampler
from torch.utils.tensorboard import SummaryWriter

import torchinfo
from torch_snippets import Report
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from TrainValidate import TrainValidate, create_weighted_sampler

device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
elif torch.has_mps:
    device = "mps"

In [3]:
device

'mps'

# Load datasets

In [4]:
features = pd.read_csv("../datasets/final_features_removed.csv")
labels = pd.read_csv("../datasets/final_labels_removed.csv")
sources = pd.read_csv("../datasets/final_sources_removed.csv")

# Create class for collecting data

In [5]:
class PriceDataset(Dataset):
    def __init__(self, features, labels):
        assert len(features) == len(labels)
        self.features = features
        self.labels = labels

    def __getitem__(self, item):
        features = self.features.to_numpy()[item]
        features = torch.tensor(features).float().to(device)

        labels = self.labels.to_numpy()[item]
        price = torch.tensor(labels[1]).float().to(device)

        return features, price

    def __len__(self):
        return len(self.features)

In [6]:
temp = PriceDataset(features, labels)
in_features = len(temp[0][0])

# Complex model (linear layers and activation functions)
## Build the model

In [20]:
class PredictPrice(nn.Module):
    def __init__(self, in_features):
        super(PredictPrice, self).__init__()
        self.hidden = nn.Sequential(nn.Linear(in_features, 128),
                                    nn.ReLU(),
                                    nn.Linear(128, 128),
                                    nn.ReLU(),
                                    nn.Linear(128, 256),
                                    nn.ReLU(),
                                    nn.Linear(256, 1024),
                                    nn.ReLU(),
                                    nn.Linear(1024, 2048),
                                    nn.ReLU(),
                                    nn.Linear(2048, 2048),
                                    nn.ReLU(),
                                    nn.Linear(2048, 2048),
                                    nn.ReLU(),
                                    nn.Linear(2048, 1024),
                                    nn.ReLU(),
                                    nn.Linear(1024, 256),
                                    nn.ReLU(),
                                    nn.Linear(256, 128),
                                    nn.ReLU())
        self.price = nn.Sequential(nn.Linear(128, 1),
                                   nn.ReLU())

    def forward(self, x):
        out = self.hidden(x)
        out = self.price(out)
        return out.squeeze()

## Data standardization

In [17]:
scaler = StandardScaler()
scaler.fit(features)
features[:] = scaler.transform(features)

## Split datasets

In [18]:
x_train, x_test, y_train, y_test = train_test_split(features, labels, random_state=1, test_size=0.2)

## Cross validation

In [21]:
kfold = KFold(n_splits=5, shuffle=True, random_state=1)
torch.manual_seed(13)
in_features = len(x_train.iloc[0])
epochs = 1000

for fold, (train_id, val_id) in enumerate(kfold.split(x_train.index)):
    train_feature, train_label = x_train.iloc[train_id], y_train.iloc[train_id]
    val_feature, val_label = x_train.iloc[val_id], y_train.iloc[val_id]
    print("\n\n-------------This is fold {}----------------".format(fold))

    train_data = PriceDataset(train_feature, train_label)
    val_data = PriceDataset(val_feature, val_label)
    train_sampler = create_weighted_sampler(train_feature["Sale or Let"].values)
    val_sampler = create_weighted_sampler(val_feature["Sale or Let"].values)

    train_loader = DataLoader(train_data, batch_size=32, shuffle=False, drop_last=True, sampler=train_sampler)
    val_loader = DataLoader(val_data, batch_size=32, shuffle=False, drop_last=True, sampler=val_sampler)

    model = PredictPrice(in_features).to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-3)

    train_validate = TrainValidate(model, nn.MSELoss(), optimizer)
    train_validate.set_loader(train_loader, val_loader)
    train_validate.train(epochs)

    train_validate.save_model("models/all_complex_full_epoch_1000_fold_{}.pth".format(fold))



-------------This is fold 0----------------
EPOCH: 0.883	train_loss: 3656614912.000	(6.47s - 7324.44s remaining))))

KeyboardInterrupt: 

## Test performance

In [69]:
filename = "models/all_complex_full_epoch_1000_fold_0.pth"

model = torch.load(filename, map_location=torch.device("cpu"))
model.eval()

prices = []
for i in range(len(x_test)):
    feature = torch.tensor((x_test.iloc[i])).float()[None, :]
    pred_price = model(feature)
    prices.append(pred_price.detach().item())

pred = pd.DataFrame({"PredictPrice": prices, "Price": y_test["Price / Rent"]})

In [70]:
error = []
for i in range(len(pred)):
    truth = pred["Price"].iloc[i]
    predict = pred["PredictPrice"].iloc[i]
    error.append(abs(truth - predict) / truth)

avg_error = sum(error) / len(error)
error = pd.DataFrame({"Error": error})
error = error.rename(index={i: j for i, j in zip(error.index, pred.index)})
pred = pd.concat([pred, error], axis=1)

In [71]:
pred

Unnamed: 0,PredictPrice,Price,Error
8301,0.0,1300.0,1.0
2973,0.0,425000.0,1.0
5494,0.0,800000.0,1.0
1475,0.0,265000.0,1.0
1321,0.0,140000.0,1.0
...,...,...,...
7917,0.0,290000.0,1.0
4395,0.0,200000.0,1.0
3057,0.0,325000.0,1.0
8669,0.0,525.0,1.0


In [72]:
print("MAE: ", mean_absolute_error(y_test["Price / Rent"], prices))
print("MSE: ", mean_squared_error(y_test["Price / Rent"], prices))
print("R2:", r2_score(y_test["Price / Rent"], prices))

MAE:  193166.9460811562
MSE:  70788722849.00833
R2: -1.1146582873316402


# Build the model (residual connection)

In [7]:
class ResidualBlock(nn.Module):
    def __init__(self, in_features, neurons):
        super(ResidualBlock, self).__init__()

        self.hidden = nn.Sequential(nn.Linear(in_features, neurons),
                                    nn.ReLU(),
                                    nn.Linear(neurons, neurons),
                                    nn.ReLU(),
                                    nn.Linear(neurons, neurons),
                                    nn.ReLU(),
                                    nn.Linear(neurons, neurons),
                                    nn.ReLU(),
                                    nn.Linear(neurons, in_features),
                                    nn.BatchNorm1d(in_features))
        self.relu = nn.ReLU()

    def forward(self, x):
        out = self.hidden(x)
        out += x
        out = self.relu(out)

        return out


class PredictPrice(nn.Module):
    def __init__(self, in_features):
        super().__init__()
        self.in_features = in_features

        self.hidden = nn.Sequential(ResidualBlock(in_features, 512),
                                    ResidualBlock(in_features, 1024),
                                    ResidualBlock(in_features, 1024),
                                    ResidualBlock(in_features, 256),
                                    ResidualBlock(in_features, 128))
        self.price = nn.Sequential(nn.Linear(in_features, 1),
                                   nn.ReLU())

    def forward(self, x):
        x = self.hidden(x)
        price = self.price(x)
        return price.squeeze()

In [8]:
model = PredictPrice(in_features)
torchinfo.summary(model, input_size=(1, in_features))

Layer (type:depth-idx)                   Output Shape              Param #
PredictPrice                             --                        --
├─Sequential: 1-1                        [1, 51]                   --
│    └─ResidualBlock: 2-1                [1, 51]                   --
│    │    └─Sequential: 3-1              [1, 51]                   840,857
│    │    └─ReLU: 3-2                    [1, 51]                   --
│    └─ResidualBlock: 2-2                [1, 51]                   --
│    │    └─Sequential: 3-3              [1, 51]                   3,254,425
│    │    └─ReLU: 3-4                    [1, 51]                   --
│    └─ResidualBlock: 2-3                [1, 51]                   --
│    │    └─Sequential: 3-5              [1, 51]                   3,254,425
│    │    └─ReLU: 3-6                    [1, 51]                   --
│    └─ResidualBlock: 2-4                [1, 51]                   --
│    │    └─Sequential: 3-7              [1, 51]                  

In [67]:
writer = SummaryWriter('runs/fashion_mnist_experiment_1')

temp_data = PriceDataset(features, labels)
temp_loader = DataLoader(temp_data, batch_size=16, drop_last=True)

x, y = next(iter(temp_loader))
x = x.to("cpu")

model = ResidualBlock(x.shape[1], 1024)

yhat = model(x)
writer.add_graph(model, x)

## Data standardization

In [9]:
scaler = StandardScaler()
scaler.fit(features)
features[:] = scaler.transform(features)

## Prepare the data

In [10]:
x_train, x_test, y_train, y_test = train_test_split(features, labels, random_state=1, test_size=0.2)

## Cross validation

In [13]:
kfold = KFold(n_splits=5, shuffle=True, random_state=1)
torch.manual_seed(13)
in_features = len(x_train.iloc[0])
epochs = 1000

for fold, (train_id, val_id) in enumerate(kfold.split(x_train.index)):
    train_feature, train_label = x_train.iloc[train_id], y_train.iloc[train_id]
    val_feature, val_label = x_train.iloc[val_id], y_train.iloc[val_id]
    print("\n\n-------------This is fold {}----------------".format(fold))

    train_data = PriceDataset(train_feature, train_label)
    val_data = PriceDataset(val_feature, val_label)
    train_sampler = create_weighted_sampler(train_feature["Sale or Let"].values)
    val_sampler = create_weighted_sampler(val_feature["Sale or Let"].values)

    train_loader = DataLoader(train_data, batch_size=32, shuffle=False, drop_last=True, sampler=train_sampler)
    val_loader = DataLoader(val_data, batch_size=32, shuffle=False, drop_last=True, sampler=val_sampler)

    model = PredictPrice(in_features).to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-3)

    train_validate = TrainValidate(model, nn.MSELoss(), optimizer)
    train_validate.set_loader(train_loader, val_loader)
    train_validate.train(epochs)

    train_validate.save_model("models/all_resnet_full_epoch_1000_fold_{}.pth".format(fold))



-------------This is fold 0----------------
EPOCH: 0.856	train_loss: 26090115072.000	(6.97s - 8134.60s remaining)))

KeyboardInterrupt: 

## Testing the model with residual connection

In [31]:
filename = "models/all_resnet_full_epoch_1000_fold_3.pth"

model = torch.load(filename, map_location=torch.device("cpu"))
model.eval()

prices = []
for i in range(len(x_test)):
    feature = torch.tensor((x_test.iloc[i])).float()[None, :]
    pred_price = model(feature)
    prices.append(pred_price.detach().item())

pred = pd.DataFrame({"PredictPrice": prices, "Price": y_test["Price / Rent"]})

In [32]:
error = []
for i in range(len(pred)):
    truth = pred["Price"].iloc[i]
    predict = pred["PredictPrice"].iloc[i]
    error.append(abs(truth - predict) / truth)

avg_error = sum(error) / len(error)
error = pd.DataFrame({"Error": error})
error = error.rename(index={i: j for i, j in zip(error.index, pred.index)})
pred = pd.concat([pred, error], axis=1)

In [33]:
pred

Unnamed: 0,PredictPrice,Price,Error
8301,460.907501,1300.0,0.645456
2973,275557.531250,425000.0,0.351629
5494,666530.687500,800000.0,0.166837
1475,243279.046875,265000.0,0.081966
1321,121057.187500,140000.0,0.135306
...,...,...,...
7917,199020.687500,290000.0,0.313722
4395,175490.937500,200000.0,0.122545
3057,224446.703125,325000.0,0.309395
8669,2131.114258,525.0,3.059265


In [34]:
print("MAE: ", mean_absolute_error(y_test["Price / Rent"], prices))
print("MSE: ", mean_squared_error(y_test["Price / Rent"], prices))
print("R2:", r2_score(y_test["Price / Rent"], prices))

MAE:  36776.657033576776
MSE:  4615504316.348444
R2: 0.862121902190556
