# Colab commands

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!cp /content/drive/MyDrive/Colab\ Notebooks/build_model/TrainValidate.py /content
!cp /content/drive/MyDrive/Colab\ Notebooks/datasets/final_features_removed.csv /content
!cp /content/drive/MyDrive/Colab\ Notebooks/datasets/final_labels_removed.csv /content
!cp /content/drive/MyDrive/Colab\ Notebooks/datasets/final_sources_removed.csv /content
!cp /content/drive/MyDrive/all_removed_weighted_sampler_full_epoch_1000_fold_2.pth /content
!mkdir /content/models

In [None]:
!pip install --quiet torchinfo
!pip install --quiet torch_snippets

# Import packages

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns

import torch
from torch import nn
from torch import optim
from torch.utils.data import DataLoader, Dataset, WeightedRandomSampler
from torch.utils.tensorboard import SummaryWriter

import torchinfo
from torch_snippets import Report
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from TrainValidate import TrainValidate, create_weighted_sampler

device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
elif torch.has_mps:
    device = "mps"

In [3]:
device

'mps'

# Load datasets

In [4]:
features = pd.read_csv("../datasets/final_features_removed.csv")
labels = pd.read_csv("../datasets/final_labels_removed.csv")
sources = pd.read_csv("../datasets/final_sources_removed.csv")

# Create class for collecting data

In [5]:
class PriceDataset(Dataset):
    def __init__(self, features, labels):
        assert len(features) == len(labels)
        self.features = features
        self.labels = labels

    def __getitem__(self, item):
        features = self.features.to_numpy()[item]
        features = torch.tensor(features).float().to(device)

        labels = self.labels.to_numpy()[item]
        price = torch.tensor(labels[1]).float().to(device)

        return features, price

    def __len__(self):
        return len(self.features)

In [6]:
temp = PriceDataset(features, labels)
in_features = len(temp[0][0])

# Building model

In [7]:
class PredictPrice(nn.Module):
    def __init__(self, in_features, num_layers, num_neurons, activation):
        super(PredictPrice, self).__init__()
        self.hidden = nn.Sequential()

        activation = activation.lower()
        activation_fn = None
        if activation == "relu":
            activation_fn = nn.ReLU()
        elif activation == "sigmoid":
            activation_fn = nn.Sigmoid()
        elif activation == "tanh":
            activation_fn = nn.Tanh()

        self.hidden.add_module("input", nn.Linear(in_features, num_neurons))
        for num in range(num_layers):
            self.hidden.add_module("linear{}".format(num), nn.Linear(num_neurons, num_neurons))
            self.hidden.add_module("activation", activation_fn)

        self.price = nn.Sequential(nn.Linear(num_neurons, 1),
                                   activation_fn)

    def forward(self, x):
        out = self.hidden(x)
        out = self.price(out)
        return out.squeeze()

In [8]:
model = PredictPrice(in_features, 10, 256, "sigmoid")
torchinfo.summary(model, input_size=(1, in_features))

Layer (type:depth-idx)                   Output Shape              Param #
PredictPrice                             --                        --
├─Sequential: 1-1                        [1, 256]                  --
│    └─Linear: 2-1                       [1, 256]                  13,312
│    └─Linear: 2-2                       [1, 256]                  65,792
├─Sequential: 1                          --                        --
│    └─Sigmoid: 2-3                      [1, 256]                  --
├─Sequential: 1                          --                        --
│    └─Linear: 2-4                       [1, 256]                  65,792
│    └─Linear: 2-5                       [1, 256]                  65,792
│    └─Linear: 2-6                       [1, 256]                  65,792
│    └─Linear: 2-7                       [1, 256]                  65,792
│    └─Linear: 2-8                       [1, 256]                  65,792
│    └─Linear: 2-9                       [1, 256]        

# Data standardization

In [9]:
scaler = StandardScaler()
scaler.fit(features)
features[:] = scaler.transform(features)

# Split dataset

In [10]:
x_train, x_test, y_train, y_test = train_test_split(features, labels, random_state=1, test_size=0.2)

# Cross validation

In [11]:
def testing(model, x_test, y_test):
    model.eval()

    prices = []
    for i in range(len(x_test)):
        feature = torch.tensor((x_test.iloc[i])).float()[None, :]
        pred_price = model(feature)
        prices.append(pred_price.detach().item())

    mae = mean_absolute_error(y_test["Price / Rent"], prices)
    mse = mean_squared_error(y_test["Price / Rent"], prices)
    r2 = r2_score(y_test["Price / Rent"], prices)

    return mae, mse, r2

In [12]:
def cross_validation(model, epoch_num, batch_size, lr, x_train, y_train, x_test, y_test):
    kfold = KFold(n_splits=5, shuffle=True, random_state=1)
    torch.manual_seed(13)

    metrics = {"mae": np.inf, "mse": np.inf, "r2": -1.0}
    best_model = None

    for fold, (train_id, val_id) in enumerate(kfold.split(x_train.index)):
        train_feature, train_label = x_train.iloc[train_id], y_train.iloc[train_id]
        val_feature, val_label = x_train.iloc[val_id], y_train.iloc[val_id]
        print("\n\n-------------This is fold {}----------------".format(fold))

        train_data = PriceDataset(train_feature, train_label)
        val_data = PriceDataset(val_feature, val_label)
        train_sampler = create_weighted_sampler(train_feature["Sale or Let"].values)
        val_sampler = create_weighted_sampler(val_feature["Sale or Let"].values)

        train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=False, drop_last=True, sampler=train_sampler)
        val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False, drop_last=True, sampler=val_sampler)

        model = model.to(device)
        optimizer = optim.Adam(model.parameters(), lr=lr)

        train_validate = TrainValidate(model, nn.MSELoss(), optimizer)
        train_validate.set_loader(train_loader, val_loader)
        train_validate.train(epoch_num)

        mae, mse, r2 = testing(model, x_test, y_test)
        if r2 > metrics["r2"]:
            metrics["r2"] = r2
            metrics["mae"] = mae
            metrics["mse"] = mse
            best_model = model

    return best_model, metrics

In [31]:
epoch_num = [300, 500, 1000, 3000]
batch_size = [4, 8, 16, 32, 64, 128]
lr = [1e-2, 3e-3, 1e-3, 3e-4, 1e-4]
num_layers = [5, 7, 10, 13, 15]
num_neurons = [128, 256, 512, 1024, 2048]
activation = ["sigmoid", "relu", "tanh"]

best_metrics = {"epoch": [], "batch_size": [], "learning_rate": [], "num_layers": [], "num_neurons": [], "activation": [],
           "mae": [], "mse": [], "r2": []}
best_models = []

In [34]:
for epoch in epoch_num:
    for size in batch_size:
        for rate in lr:
            for layer in num_layers:
                for neurons in num_neurons:
                    for activation_fn in activation:
                        print("epoch: {}\tbatch_size: {}\tlearning rate: {}\nneurons: {}\t "
                              "layers: {}\tactivation function: {}".format(epoch, size, rate, neurons, layer, activation_fn))
                        model = PredictPrice(in_features, layer, neurons, activation_fn)

                        model, metrics = cross_validation(model, epoch, size, rate, x_train, y_train, x_train, y_test)
                        model = model.to("cpu")

                        if metrics["r2"] > best_metrics["r2"]:
                            best_metrics["r2"].append(metrics["r2"])
                            best_metrics["mae"].append(metrics["mae"])
                            best_metrics["mse"].append(metrics["mse"])
                            best_metrics["epoch"].append(epoch)
                            best_metrics["batch_size"].append(size)
                            best_metrics["learning_rate"].append(rate)
                            best_metrics["num_layers"].append(layer)
                            best_metrics["num_neurons"].append(neurons)
                            best_metrics["activation"].append(activation_fn)
                            best_models.append(model)

                        print("R2: {}".format(metrics["r2"]))
torch.save(best_metrics, "metrics.pth")

epoch: 300	batch_size: 4	learning rate: 0.01
neurons: 128	 layers: 5	activation function: sigmoid


-------------This is fold 0----------------
EPOCH: 0.756	train_loss: 529513.500	(12.98s - 5138.99s remaining)ning))

KeyboardInterrupt: 