<a href="https://colab.research.google.com/github/monkofwst/HousePricesKaggle/blob/main/HousePricesTrainingV2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import torch

BATCH_SIZE = 64
NUM_EPOCHS = 10

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [2]:
import torch
import pandas as pd
from torch.utils.data import Dataset
import numpy as np


class CSVDataset(Dataset):
    def __init__(self, transform=None, train=True):
        self.data = pd.read_csv('train_clean.csv')
        if train:
            self.data = self.data.iloc[ : int(len(self.data) * 0.8)]
        else:  # if test
            self.data = self.data.iloc[int(len(self.data) * 0.8) : ]
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        sample = (row[:-1], row[-1])
        if self.transform:
            sample = self.transform(sample)
        return sample

In [3]:
class ToTensor:
    def __call__(self, sample: np.array):
        X = sample[0]
        y = sample[1]
        return torch.tensor(X), torch.tensor(y)

In [4]:
train = CSVDataset(transform=ToTensor(), train=True)
test = CSVDataset(transform=ToTensor(), train=False)

In [5]:
train[0]

(tensor([ 0.0733,  3.0000, -0.2079, -0.2071,  1.0000,  3.0000,  3.0000,  0.0000,
          4.0000,  0.0000,  5.0000,  2.0000,  2.0000,  0.0000,  5.0000,  0.6513,
         -0.5170,  1.0506,  0.8784,  1.0000,  1.0000, 12.0000, 13.0000,  0.5098,
          2.0000,  4.0000,  2.0000,  2.0000,  3.0000,  3.0000,  2.0000,  0.5752,
          5.0000, -0.2886, -0.9443, -0.4591,  1.0000,  0.0000,  1.0000,  4.0000,
         -0.7932,  1.1615, -0.1202,  0.3702,  1.1074, -0.2410,  0.7895,  1.2272,
          0.1637, -0.2114,  2.0000,  0.9119,  6.0000, -0.9509,  1.0000,  0.9921,
          1.0000,  0.3116,  0.3509,  4.0000,  4.0000,  2.0000, -0.7519,  0.2164,
         -0.3592, -0.1163, -0.2701, -0.0687, -0.0877, -1.5986,  0.1387,  8.0000,
          4.0000], dtype=torch.float64),
 tensor(0.3472, dtype=torch.float64))

In [6]:
from torch.utils.data import DataLoader

train_dl = DataLoader(train, batch_size=32, shuffle=True)
test_dl = DataLoader(test, batch_size=32, shuffle=True)

In [7]:
from torch import nn


class HousePricesModelV0(nn.Module):
    def __init__(self, input_shape, hidden_size):
        super().__init__()
        self.block = nn.Sequential(
            nn.Linear(input_shape, hidden_size),
            nn.Linear(hidden_size, hidden_size),
            nn.Linear(hidden_size, hidden_size),
            nn.Linear(hidden_size, 1),
        )

    def forward(self, x):
        return self.block(x)


model_0 = HousePricesModelV0(input_shape=73, hidden_size=10).to(device)

In [8]:
loss_fn = nn.L1Loss()
optimizer = torch.optim.Adam(params=model_0.parameters())

In [9]:
for epoch in range(NUM_EPOCHS):
    print(f'Epoch {epoch}')
    train_loss = 0
    model_0.train()
    for X, y in train_dl:
        X, y = X.to(dtype=torch.float32, device=device), y.to(dtype=torch.float32, device=device)
        logits = model_0(X)
        loss = loss_fn(logits.squeeze(), y)
        train_loss += loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    train_loss /= len(train_dl)

    test_loss = 0
    model_0.eval()
    with torch.inference_mode():
        for X, y in test_dl:
            X, y = X.to(dtype=torch.float32, device=device), y.to(dtype=torch.float32, device=device)
            test_logits = model_0(X)
            test_loss += loss_fn(test_logits.squeeze(), y)
        test_loss /= len(test_dl)
    print(f'Train Loss {train_loss} | Test Loss {test_loss}')

Epoch 0
Train Loss 0.5909215211868286 | Test Loss 0.4357468783855438
Epoch 1
Train Loss 0.34653839468955994 | Test Loss 0.3431681990623474
Epoch 2
Train Loss 0.28585919737815857 | Test Loss 0.29485654830932617
Epoch 3
Train Loss 0.2600274384021759 | Test Loss 0.2900572717189789
Epoch 4
Train Loss 0.24601230025291443 | Test Loss 0.27209633588790894
Epoch 5
Train Loss 0.23519307374954224 | Test Loss 0.25858792662620544
Epoch 6
Train Loss 0.2277502417564392 | Test Loss 0.2620530128479004
Epoch 7
Train Loss 0.22849349677562714 | Test Loss 0.2884455919265747
Epoch 8
Train Loss 0.22619009017944336 | Test Loss 0.2592795789241791
Epoch 9
Train Loss 0.2229718714952469 | Test Loss 0.41372033953666687


In [10]:
submission = pd.read_csv('sample_submission.csv')

In [11]:
to_predict = pd.read_csv('test_clean.csv').values
to_predict

array([[-0.87441099,  2.        ,  0.5103331 , ...,  1.71331792,
         8.        ,  4.        ],
       [-0.87441099,  3.        ,  0.55502216, ...,  1.71331792,
         8.        ,  4.        ],
       [ 0.06132983,  3.        ,  0.24219874, ...,  1.71331792,
         8.        ,  4.        ],
       ...,
       [-0.87441099,  3.        ,  4.08545789, ..., -1.35949197,
         8.        ,  0.        ],
       [ 0.64616784,  3.        , -0.29406997, ..., -1.35949197,
         8.        ,  4.        ],
       [ 0.06132983,  3.        ,  0.24219874, ..., -1.35949197,
         8.        ,  4.        ]])

In [12]:
train_original = pd.read_csv('train.csv')

In [13]:
with torch.inference_mode():
    predictions = model_0(torch.Tensor(to_predict).to(dtype=torch.float32, device=device))
    predictions *= train_original['SalePrice'].std()
    predictions += train_original['SalePrice'].mean()

In [14]:
predictions

tensor([[133036.8281],
        [179098.2656],
        [185760.4688],
        ...,
        [198211.5781],
        [118200.0000],
        [246225.3125]], device='cuda:0')

In [24]:
test_original = pd.read_csv('test.csv')
test_original['Id'].values

array([1461, 1462, 1463, ..., 2917, 2918, 2919])

In [25]:
result = pd.DataFrame({'Id': test_original['Id'].values, 'SalePrice': predictions.to(device='cpu').squeeze().numpy()})

In [26]:
result

Unnamed: 0,Id,SalePrice
0,1461,133036.828125
1,1462,179098.265625
2,1463,185760.468750
3,1464,202504.703125
4,1465,187498.531250
...,...,...
1454,2915,74899.562500
1455,2916,68369.796875
1456,2917,198211.578125
1457,2918,118200.000000


In [28]:
result.to_csv('submission.csv', index=False)