In [1]:
import os
import glob
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from google.colab import files
from PIL import Image
import matplotlib.pyplot as plt
import cv2
from google.colab.patches import cv2_imshow

In [2]:
! pip install -q kaggle
files.upload() #upload kaggle.json file
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


In [3]:
def downloadKaggleData(link):
  name = (link.split(' ')[-1]).split('/')[-1]
  ! {link}
  ! mkdir {name}
  ! unzip {name}.zip -d {name}
downloadKaggleData("kaggle competitions download -c house-prices-advanced-regression-techniques")

Downloading house-prices-advanced-regression-techniques.zip to /content
  0% 0.00/199k [00:00<?, ?B/s]
100% 199k/199k [00:00<00:00, 23.3MB/s]
Archive:  house-prices-advanced-regression-techniques.zip
  inflating: house-prices-advanced-regression-techniques/data_description.txt  
  inflating: house-prices-advanced-regression-techniques/sample_submission.csv  
  inflating: house-prices-advanced-regression-techniques/test.csv  
  inflating: house-prices-advanced-regression-techniques/train.csv  


In [50]:
import torch
from torch.utils.data import Dataset, DataLoader, Sampler
from sklearn.preprocessing import OneHotEncoder

class TrainDataset(Dataset):
  """House-price training data, cleaned and one-hot encoded, served as tensors.

  The CSV at `dataLink` is read, the 'Id' column is dropped, sparse columns are
  removed, remaining gaps are filled, and object-typed columns are one-hot
  encoded. Features are stored in `self.x` (float32, one row per sample) and
  'SalePrice' in `self.y` (float32, shape (N, 1)).

  Attributes:
    x: Feature tensor.
    y: Unscaled SalePrice tensor; scaling happens in `__getitem__`.
    encoder: The fitted OneHotEncoder (None when there were no object columns),
      kept so other data can be encoded with the same categories.
    feature_names: Column names of `x`, in order.
  """

  # SalePrice is divided by this in __getitem__ to keep targets small.
  TARGET_SCALE = 100000

  def __init__(self, dataLink):
    """Load and preprocess the CSV found at `dataLink`."""
    #Data cleaning taken from https://www.geeksforgeeks.org/house-price-prediction-using-machine-learning-in-python/
    #with some improvement by me (Ryan) inspired from https://www.kaggle.com/code/srivignesh/data-preprocessing-for-house-price-prediction
    dataset = pd.read_csv(dataLink)

    #Drop unnecessary ID column
    dataset = dataset.drop(columns=['Id'])

    #Fill in nan in saleprice
    dataset['SalePrice'] = dataset['SalePrice'].fillna(dataset['SalePrice'].mean())

    #Drop columns with more than 70% nan
    mostly_nan = dataset.isnull().sum() > 0.7 * dataset.shape[0]
    dataset = dataset.drop(columns=list(mostly_nan[mostly_nan].index))

    #Fill nan for the rest of the dataset with each column's most frequent value
    for col in dataset:
      dataset[col] = dataset[col].fillna(dataset[col].mode()[0])

    #One-hot encode every object-typed (categorical) column
    is_object = (dataset.dtypes == 'object')
    object_cols = list(is_object[is_object].index)
    self.encoder = None
    if object_cols:
      # Keep the fitted encoder so the same categories can be applied later.
      self.encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
      encoded = pd.DataFrame(
          self.encoder.fit_transform(dataset[object_cols]),
          columns=self.encoder.get_feature_names_out(),
          index=dataset.index,  # explicit index so concat aligns row-for-row
      )
      dataset = pd.concat([dataset.drop(columns=object_cols), encoded], axis=1)

    features = dataset.drop(columns='SalePrice')
    self.feature_names = list(features.columns)
    self.x = torch.tensor(features.values, dtype=torch.float32)
    self.y = torch.tensor(dataset['SalePrice'].values, dtype=torch.float32)

    # Column vector so targets match the (N, 1) output of a single-unit head.
    self.y = self.y.reshape(-1, 1)

  def __len__(self):
    """Number of samples (rows of `self.x`)."""
    return self.x.shape[0]

  def __getitem__(self, ind):
    """Return (features, SalePrice / TARGET_SCALE) for index `ind`."""
    x = self.x[ind]
    y = self.y[ind] / self.TARGET_SCALE
    return x, y

# Build the training set from the CSV extracted by the Kaggle download cell.
train_set = TrainDataset("house-prices-advanced-regression-techniques/train.csv")

batch_size = 128
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)

# Sanity-check the first batch. The data is tabular, so X is 2-D:
# N samples by F features (not the N, C, H, W layout of image batches).
for X, y in train_loader:
    print(f"Shape of X [N, F]: {X.shape} {X.dtype}")
    print(f"Shape of y: {y.shape} {y.dtype}")
    break
print(train_set[0])

Shape of X [N, C, H, W]: torch.Size([128, 274]) torch.float32
Shape of y: torch.Size([128, 1]) torch.float32
(tensor([6.0000e+01, 6.5000e+01, 8.4500e+03, 7.0000e+00, 5.0000e+00, 2.0030e+03,
        2.0030e+03, 1.9600e+02, 7.0600e+02, 0.0000e+00, 1.5000e+02, 8.5600e+02,
        8.5600e+02, 8.5400e+02, 0.0000e+00, 1.7100e+03, 1.0000e+00, 0.0000e+00,
        2.0000e+00, 1.0000e+00, 3.0000e+00, 1.0000e+00, 8.0000e+00, 0.0000e+00,
        2.0030e+03, 2.0000e+00, 5.4800e+02, 0.0000e+00, 6.1000e+01, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 2.0000e+00, 2.0080e+03,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00,
        1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 1.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00, 1.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        1.

In [45]:
import torch
import torch.nn as nn

# Use the GPU when CUDA reports one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")
print(f'Using {device} device')

class MLP(nn.Module):
  """Fully connected network assembled from a list of layer widths.

  Each hidden layer is a Linear followed by a ReLU; a final Linear maps to
  `output_dim`. With the defaults there are no hidden layers, so the model is
  a single Linear(274, 1).

  Args:
    input_dim: Number of input features (default 274).
    hidden_dims: Iterable of hidden-layer widths, in order (default: none).
    output_dim: Number of outputs (default 1).
  """
  def __init__(self, input_dim=274, hidden_dims=(), output_dim=1) -> None:
    super().__init__()
    # widths[i] -> widths[i+1] describes the i-th hidden Linear layer.
    widths = [input_dim, *hidden_dims]

    self.layers = nn.Sequential()
    for i, (fan_in, fan_out) in enumerate(zip(widths, widths[1:]), start=1):
      self.layers.add_module(f'linear_{i}', nn.Linear(fan_in, fan_out))
      self.layers.add_module(f'relu_{i}', nn.ReLU())
    self.layers.add_module('linear_last', nn.Linear(widths[-1], output_dim))

  def forward(self, x):
    """Cast `x` to float32 and pass it through the layer stack."""
    x = x.float()
    return self.layers(x)

# Instantiate the network and place its parameters on the selected device.
model = MLP()
model = model.to(device)

# Mean-squared-error objective, optimised with plain SGD at a very small step size.
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-9)

print(model)

Using cpu device
MLP(
  (layers): Sequential(
    (linear_last): Linear(in_features=274, out_features=1, bias=True)
  )
)


In [16]:
def train(dataloader, model, loss_fn, optimizer):
  """Run one training epoch and report the mean per-batch loss.

  Args:
    dataloader: Sized iterable yielding (X, y) tensor batches.
    model: The nn.Module to optimise; put into training mode here.
    loss_fn: Callable taking (prediction, target) and returning a scalar loss.
    optimizer: Optimizer stepping `model`'s parameters.

  Returns:
    The average of the per-batch losses as a float (also printed).

  Raises:
    ValueError: If `dataloader` contains no batches.
  """
  num_batches = len(dataloader)
  if num_batches == 0:
    raise ValueError("dataloader yields no batches; nothing to train on")

  model.train()

  # Send batches to wherever the model's parameters live, so this function
  # does not depend on a `device` global defined in another cell.
  first_param = next(model.parameters(), None)
  batch_device = first_param.device if first_param is not None else torch.device("cpu")

  total_loss = 0.0
  for X, y in dataloader:
    X, y = X.to(batch_device), y.to(batch_device)
    optimizer.zero_grad()
    loss = loss_fn(model(X), y)
    loss.backward()
    optimizer.step()
    total_loss += loss.item()

  train_loss = total_loss / num_batches

  print(f"Average loss: {train_loss:7f}")
  return train_loss



In [51]:
# Number of full passes over the training loader.
epochs = 100
for epoch in range(epochs):
    epoch_label = epoch + 1  # 1-based numbering for the progress message
    print(f"Training epoch: {epoch_label}")
    train(dataloader=train_loader, model=model, loss_fn=criterion, optimizer=optimizer)

Training epoch: 1
Average loss: 256.218465
Training epoch: 2
Average loss: 260.973321
Training epoch: 3
Average loss: 253.200036
Training epoch: 4
Average loss: 250.238861
Training epoch: 5
Average loss: 254.090734
Training epoch: 6
Average loss: 247.821211
Training epoch: 7
Average loss: 245.042815
Training epoch: 8
Average loss: 247.906298
Training epoch: 9
Average loss: 248.399028
Training epoch: 10
Average loss: 251.078547
Training epoch: 11
Average loss: 246.318303
Training epoch: 12
Average loss: 248.495837
Training epoch: 13
Average loss: 244.978821
Training epoch: 14
Average loss: 241.099609
Training epoch: 15
Average loss: 245.806418
Training epoch: 16
Average loss: 248.283746
Training epoch: 17
Average loss: 248.005009
Training epoch: 18
Average loss: 252.572923
Training epoch: 19
Average loss: 250.227187
Training epoch: 20
Average loss: 244.817192
Training epoch: 21
Average loss: 243.179015
Training epoch: 22
Average loss: 243.316545
Training epoch: 23
Average loss: 243.3136