# Colab commands

In [1]:
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google.colab'

In [None]:
!cp /content/drive/MyDrive/Colab\ Notebooks/build_model/TrainValidate.py /content
!cp /content/drive/MyDrive/Colab\ Notebooks/datasets/final_features_removed.csv /content
!cp /content/drive/MyDrive/Colab\ Notebooks/datasets/final_labels_removed.csv /content
!cp /content/drive/MyDrive/Colab\ Notebooks/datasets/final_sources_removed.csv /content
!cp /content/drive/MyDrive/all_removed_weighted_sampler_full_epoch_1000_fold_2.pth /content
!cp /content/drive/MyDrive/Colab\ Notebooks/datasets/final_features_prices.csv /content
!mkdir /content/models

In [None]:
!pip install --quiet torchinfo
!pip install --quiet torch_snippets

# Import packages

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

import torch
from torch import nn
from torch import optim
from torch.utils.data import DataLoader, Dataset, WeightedRandomSampler
from torch.utils.tensorboard import SummaryWriter

import torchinfo
from torch_snippets import Report
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from TrainValidate import TrainValidate, create_weighted_sampler

device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
elif torch.has_mps:
    device = "mps"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device

'mps'

# Class for collecting data

In [3]:
class PriceDataset(Dataset):
    def __init__(self, features, labels):
        assert len(features) == len(labels)
        self.features = features
        self.labels = labels

    def __getitem__(self, item):
        features = self.features.to_numpy()[item]
        features = torch.tensor(features).float().to(device)

        labels = self.labels.to_numpy()[item]
        price = torch.tensor([labels[1]]).float().to(device)

        return features, price

    def __len__(self):
        return len(self.features)

# Build the model

In [4]:
class PredictPrice(nn.Module):
    def __init__(self, in_features):
        super().__init__()
        self.in_features = in_features

        self.hidden = nn.Sequential(nn.Linear(in_features, 128),
                                    nn.ReLU(),
                                    nn.Linear(128, 128),
                                    nn.ReLU(),
                                    nn.Linear(128, 256),
                                    nn.ReLU(),
                                    nn.Linear(256, 128),
                                    nn.ReLU())
        self.price = nn.Sequential(nn.Linear(128, 1),
                                   nn.ReLU())

    def forward(self, x):
        x = self.hidden(x)
        price = self.price(x)
        return price.squeeze()

# Prepare data

In [5]:
features = pd.read_csv("../datasets/final_features_removed.csv")
labels = pd.read_csv("../datasets/final_labels_removed.csv")

# remove feature
features = features.loc[:, features.columns != "Postcode"]

scaler = StandardScaler()
scaler.fit(features)
features[:] = scaler.transform(features)

x_train, x_test, y_train, y_test = train_test_split(features, labels, random_state=1, test_size=0.1)

# Train the model

In [None]:
kfold = KFold(n_splits=5, shuffle=True, random_state=1)
in_features = len(x_train.iloc[0])
epochs = 1000

for fold, (train_id, val_id) in enumerate(kfold.split(x_train.index)):
    train_feature, train_label = x_train.iloc[train_id], y_train.iloc[train_id]
    val_feature, val_label = x_train.iloc[val_id], y_train.iloc[val_id]
    print("\n\n-------------This is fold {}----------------".format(fold))

    train_data = PriceDataset(train_feature, train_label)
    val_data = PriceDataset(val_feature, val_label)

    train_sampler = create_weighted_sampler(train_feature["Sale or Let"].values)
    val_sampler = create_weighted_sampler(val_feature["Sale or Let"].values)

    train_loader = DataLoader(train_data, batch_size=32, drop_last=True, sampler=train_sampler)
    val_loader = DataLoader(val_data, batch_size=32, drop_last=True, sampler=val_sampler)

    model = PredictPrice(in_features).to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-3)

    train_validate = TrainValidate(model, nn.MSELoss(), optimizer)
    train_validate.set_loader(train_loader, val_loader)
    train_validate.train(epochs)

    train_validate.save_model("models/price_ablation_postcode_fold_{}.pth".format(fold))

# Test performance

In [19]:
filename = "models/price_ablation_postcode_fold_0.pth"

model = torch.load(filename, map_location=torch.device("cpu"))
model.eval()

prices = []
for i in range(len(x_test)):
    feature = torch.tensor((x_test.iloc[i])).float()
    pred_price = model(feature)
    prices.append(pred_price.detach().item())

pred = pd.DataFrame({"PredictPrice": prices, "Price": y_test["Price / Rent"]})

error = []
for i in range(len(pred)):
    truth = pred["Price"].iloc[i]
    predict = pred["PredictPrice"].iloc[i]
    error.append(abs(truth - predict) / truth)

avg_error = sum(error) / len(error)
error = pd.DataFrame({"Error": error})
error = error.rename(index={i: j for i, j in zip(error.index, pred.index)})
pred = pd.concat([pred, error], axis=1)
pred

Unnamed: 0,PredictPrice,Price,Error
8301,858.365723,1300.0,0.339719
2973,421742.562500,425000.0,0.007665
5494,789532.562500,800000.0,0.013084
1475,139683.500000,265000.0,0.472892
1321,131880.515625,140000.0,0.057996
...,...,...,...
2680,858.365723,700.0,0.226237
3169,391616.312500,340000.0,0.151813
8816,129942.148438,90000.0,0.443802
1172,858.365723,765.0,0.122047


In [20]:
print("MAE: ", mean_absolute_error(y_test["Price / Rent"], prices))
print("MSE: ", mean_squared_error(y_test["Price / Rent"], prices))
print("R2:", r2_score(y_test["Price / Rent"], prices))

MAE:  42961.96455810547
MSE:  5279288158.072391
R2: 0.8464540169760841


# Alter the inputs
## Load datasets

In [94]:
features = pd.read_csv("../datasets/final_features_removed.csv")
labels = pd.read_csv("../datasets/final_labels_removed.csv")

features["other number"].loc[8301] += 4
features.iloc[8301]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features["other number"].loc[8301] += 4


Postcode                                      1462.0
Sale or Let                                      0.0
Price Qualifier                                  3.0
DESC Council Tax Band                            2.0
RTD3316_condition1 - Condition Description       1.0
# of Enquiry or viewings                         0.0
# of Apps/Offers                                 0.0
bedroom number                                   3.0
kitchen number                                   1.0
living number                                    1.0
bathroom number                                  1.0
dining number                                    1.0
other number                                     7.0
Allocated                                        0.0
Communal                                         0.0
Covered                                          0.0
Driveway                                         0.0
Garage                                           0.0
Gated                                         

## Prepare data

In [95]:
scaler = StandardScaler()
scaler.fit(features)
features[:] = scaler.transform(features)

x_train, x_test, y_train, y_test = train_test_split(features, labels, random_state=1, test_size=0.1)

## Predict output

In [96]:
filename = "models/all_removed_full_epoch_1000_fold_1.pth"

model = torch.load(filename, map_location=torch.device("cpu"))
model.eval()

prices = []
for i in range(len(x_test)):
    feature = torch.tensor((x_test.iloc[i])).float()
    pred_price = model(feature)
    prices.append(pred_price.detach().item())

pred = pd.DataFrame({"PredictPrice": prices, "Price": y_test["Price / Rent"]})
pred

Unnamed: 0,PredictPrice,Price
8301,1109.529053,1300.0
2973,369834.625000,425000.0
5494,614322.562500,800000.0
1475,192804.359375,265000.0
1321,79255.039062,140000.0
...,...,...
2680,0.000000,700.0
3169,420017.656250,340000.0
8816,86790.773438,90000.0
1172,219.496323,765.0
