# Setup

In [None]:
!pip install wandb

Collecting wandb
  Downloading wandb-0.15.12-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.40-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.6/190.6 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.33.0-py2.py3-none-any.whl (243 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m243.2/243.2 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting pathtools (from wandb)
  Downloading pathtools-0.1.2.tar.gz (11 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.ma

In [None]:
# Mount Google Drive at /content/drive; the CSVs loaded later are read from
# paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import wandb

from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from torch.utils.data import TensorDataset, DataLoader

# Prefer the GPU when one is attached, but fall back to the CPU so the notebook
# still runs end-to-end on a runtime without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed NumPy and PyTorch so the split and the weight initialisation are repeatable.
np.random.seed(42)
torch.manual_seed(42);  # trailing ';' keeps the Generator repr out of the cell output

<torch._C.Generator at 0x789118ba6970>

In [None]:
# Root folder of the competition files on the mounted Drive. Every CSV below is
# resolved against it, so relocating the data only means editing this one line.
# NOTE: the folder name 'ME ' ends with a space in the original paths; kept as-is.
DATA_DIR = '/content/drive/MyDrive/ME /SinoPac'

# Training data plus the public and private test sets.
train = pd.read_csv(f'{DATA_DIR}/30_Training Dataset_V2/training_data_v1104v2.csv')
testpub = pd.read_csv(f'{DATA_DIR}/public test/public_dataset_v1104v3.csv')
testprv = pd.read_csv(f'{DATA_DIR}/private test/private_dataset_v3.csv')

# Submission templates.
subpub = pd.read_csv(f'{DATA_DIR}/public_submission_template.csv')
subprv = pd.read_csv(f'{DATA_DIR}/private_submission_template.csv')
subpubprv = pd.read_csv(f'{DATA_DIR}/public_private_submission_template.csv')

# Preprocessing

In [None]:
# Sanity check: (rows, columns) of every frame that was just loaded.
print('train:', train.shape)
print('public test :', testpub.shape)
print('private test:', testprv.shape)
print('public sub  :', subpub.shape)
print('private sub :', subprv.shape)
print('public_private sub :', subpubprv.shape)

In [None]:
# Translate the Chinese column headers to English and remove the remarks column.
column_name_mapping = {
    '縣市': 'County',
    '鄉鎮市區': 'District',
    '路名': 'Road',
    '土地面積': 'Land Area',
    '使用分區': 'Use Partitions',
    '移轉層次': 'Floor Level',
    '總樓層數': 'Total Floors',
    '主要用途': 'Main Purpose',
    '主要建材': 'Main Materials',
    '建物型態': 'Building Type',
    '屋齡': 'House Age',
    '建物面積': 'Building Area',
    '車位面積': 'Parking Area',
    '車位個數': 'Num of Parking',
    '橫坐標': 'x-axis',
    '縱坐標': 'y-axis',
    '主建物面積': 'Main Building Area',
    '陽台面積': 'Balcony Area',
    '附屬建物面積': 'Ancillary Building Area',
    '單價': 'Price',
}


def _translate_and_trim(raw_frame):
    """Return a new frame with English column names and without the '備註' column."""
    return raw_frame.rename(columns=column_name_mapping).drop(columns=['備註'])


# The loaded frames are left untouched; each df_* is an independent renamed copy.
df_train = _translate_and_trim(train)
df_testpub = _translate_and_trim(testpub)
df_testprv = _translate_and_trim(testprv)

print('train:', df_train.shape)
print('public test :', df_testpub.shape)
print('private test:', df_testprv.shape)

In [None]:
def _strip_id_prefix(raw_id, prefix):
    """Turn e.g. 'TR-12' into 12; values that are not strings starting with ``prefix`` pass through unchanged."""
    if isinstance(raw_id, str) and raw_id.startswith(prefix):
        return int(raw_id.replace(prefix, ''))
    return raw_id


# Each split tags its IDs with its own prefix; keep only the integer part.
df_train['ID'] = df_train['ID'].apply(_strip_id_prefix, prefix='TR-')
df_testpub['ID'] = df_testpub['ID'].apply(_strip_id_prefix, prefix='PU-')
df_testprv['ID'] = df_testprv['ID'].apply(_strip_id_prefix, prefix='PR-')

# NOTE(review): 'x-axis' and 'y-axis' are not listed here, so they are left
# unscaled by this cell — confirm that is intended.
numeric_features = ['Land Area', 'Floor Level', 'Total Floors', 'House Age', 'Building Area', 'Parking Area', 'Num of Parking', 'Main Building Area',
                    'Balcony Area', 'Ancillary Building Area']
train_numeric = df_train[numeric_features]
testpub_numeric = df_testpub[numeric_features]
testprv_numeric = df_testprv[numeric_features]

# Fit the scaler on the training rows only, then apply the same statistics to
# BOTH test sets. The private test set must be scaled as well, otherwise its
# numeric features are on a different scale from what the model was trained on.
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_numeric)
testpub_scaled = scaler.transform(testpub_numeric)
testprv_scaled = scaler.transform(testprv_numeric)

df_train[numeric_features] = train_scaled
df_testpub[numeric_features] = testpub_scaled
df_testprv[numeric_features] = testprv_scaled

# Integer-encode every object-typed column. The encoder is refit per column on
# the union of all three splits so a category that appears only in a test set
# still receives a code.
label_encoder = LabelEncoder()
object_columns = df_train.select_dtypes(include=['object']).columns

for column in object_columns:
    combined_data = pd.concat([df_train[column], df_testpub[column], df_testprv[column]], axis=0)
    label_encoder.fit(combined_data)

    df_train[column] = label_encoder.transform(df_train[column])
    df_testpub[column] = label_encoder.transform(df_testpub[column])
    df_testprv[column] = label_encoder.transform(df_testprv[column])

In [None]:
# Correlation of every column with the target ('Price').
correlation_matrix = df_train.corr()
correlation_with_target = correlation_matrix.loc[:, 'Price']
print(correlation_with_target)

In [None]:
# Columns removed from all three splits before modelling.
dropped_columns = ['ID', 'Use Partitions', 'Balcony Area']

df2_train = df_train.drop(columns=dropped_columns)
df2_testpub = df_testpub.drop(columns=dropped_columns)
df2_testprv = df_testprv.drop(columns=dropped_columns)

print('train:', df2_train.shape)
print('public test :', df2_testpub.shape)
print('private test:', df2_testprv.shape)

# Data Splitting: hold out 5% of the training rows for evaluation.
X = df2_train.drop(columns='Price')
y = df2_train['Price']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.05, random_state=42, shuffle=True
)

In [None]:
# Switch to tensor
def _as_float_tensor(array):
    """Copy a NumPy array into a float32 torch tensor."""
    return torch.tensor(array, dtype=torch.float32)


# Features keep their 2-D layout; targets are reshaped into a single column.
X_train = _as_float_tensor(X_train.values)
y_train = _as_float_tensor(y_train.values.reshape(-1, 1))
X_test = _as_float_tensor(X_test.values)
y_test = _as_float_tensor(y_test.values.reshape(-1, 1))

# Build the datasets
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

# Model and Train Function

In [None]:
# MLP
class MLP(nn.Module):
    """Fully connected network: input projection, a stack of identical hidden blocks, linear head.

    Args:
        input_size: number of input features.
        hidden_size: width of the input projection and of every hidden block.
        output_size: number of values produced by the head.
        num_layers: how many hidden blocks follow the input projection.
        dropout_rate: dropout probability used inside each hidden block.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers, dropout_rate=0.5):
        super().__init__()

        # Projection from the feature space to the hidden width.
        self.input_layer = nn.Linear(input_size, hidden_size)

        # Each hidden block is Linear -> ReLU -> BatchNorm -> Dropout.
        blocks = []
        for _ in range(num_layers):
            blocks.append(
                nn.Sequential(
                    nn.Linear(hidden_size, hidden_size),
                    nn.ReLU(),
                    nn.BatchNorm1d(hidden_size),
                    nn.Dropout(p=dropout_rate),
                )
            )
        self.hidden_layers = nn.ModuleList(blocks)

        # Linear head.
        self.output_layer = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Apply the input projection with ReLU, every hidden block in order, then the head."""
        hidden = F.relu(self.input_layer(x))
        for block in self.hidden_layers:
            hidden = block(hidden)
        return self.output_layer(hidden)

In [None]:
def train_one_epoch(train_data_loader):
  """Run one optimisation pass over ``train_data_loader`` and log the result to W&B.

  Uses the notebook-level ``model``, ``optimizer``, ``criterion`` and ``device``.

  Args:
    train_data_loader: iterable yielding ``(variables, labels)`` batches.

  Returns:
    Tuple ``(epoch_loss, mape, total_time)``: the mean of the per-batch losses,
    the MAPE over every sample seen this epoch expressed in percent (the same
    scale ``val_one_epoch`` reports), and the elapsed wall-clock seconds.
  """
  batch_losses = []
  trues = []
  prediction = []
  start_time = time.time()
  model.train()

  for variables, labels in train_data_loader:
    variables = variables.to(device)
    labels = labels.to(device)
    optimizer.zero_grad()

    # forward
    preds = model(variables)
    loss = criterion(preds, labels)

    # backward
    loss.backward()
    optimizer.step()

    # Keep plain Python/NumPy copies so the epoch metrics are computed once at
    # the end, outside the autograd graph.
    batch_losses.append(loss.item())
    trues.append(labels.detach().cpu().numpy())
    prediction.append(preds.detach().cpu().numpy())

  trues = np.concatenate(trues)
  prediction = np.concatenate(prediction)
  # Percent, not fraction, so train and validation MAPE are directly comparable.
  mape = mean_absolute_percentage_error(trues, prediction) * 100
  total_time = time.time() - start_time
  epoch_loss = np.mean(batch_losses)
  wandb.log({"train_loss": epoch_loss, "train_mape": mape})

  return epoch_loss, mape, total_time

def val_one_epoch(val_data_loader, best_mape):
  """Evaluate the model on ``val_data_loader``, log to W&B and checkpoint on improvement.

  Uses the notebook-level ``model``, ``criterion`` and ``device``.

  Args:
    val_data_loader: iterable yielding ``(variables, labels)`` batches.
    best_mape: lowest MAPE (in percent) seen so far, or ``None`` on the first call.

  Returns:
    Tuple ``(epoch_loss, mape, total_time, best_mape)``: the mean of the
    per-batch losses, this epoch's MAPE in percent, the elapsed wall-clock
    seconds, and the (possibly updated) best MAPE.

  Side effects:
    Writes ``model.state_dict()`` to ``model.pth`` when ``best_mape`` is
    ``None`` or this epoch's MAPE is lower than ``best_mape``.
  """
  batch_losses = []
  trues = []
  prediction = []
  start_time = time.time()
  model.eval()

  with torch.no_grad():
    for variables, labels in val_data_loader:
      variables = variables.to(device)
      labels = labels.to(device)
      preds = model(variables)

      batch_losses.append(criterion(preds, labels).item())
      trues.append(labels.cpu().numpy())
      prediction.append(preds.cpu().numpy())

  trues = np.concatenate(trues)
  prediction = np.concatenate(prediction)
  mape = mean_absolute_percentage_error(trues, prediction) * 100
  total_time = time.time() - start_time
  epoch_loss = np.mean(batch_losses)
  wandb.log({"test_loss": epoch_loss, "test_mape": mape})

  # First epoch, or a new best: remember the score and checkpoint the weights.
  if best_mape is None or mape < best_mape:
    best_mape = mape
    torch.save(model.state_dict(), "model.pth")

  return epoch_loss, mape, total_time, best_mape

# Model Training

In [None]:
# Define parameters
batch_size = 64
epochs = 500
lr = 0.00001
weight_decay = 0.0001

input_size = X_train.shape[1]
hidden_size = 128
num_layers = 3
output_size = 1

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

model = MLP(input_size, hidden_size, output_size, num_layers)
model = model.to(device)
criterion = nn.SmoothL1Loss()
optimizer = torch.optim.RAdam(model.parameters(), lr=lr, weight_decay=weight_decay)
lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1, patience=10, mode='max')

In [None]:
wandb.login()
wandb.init(project='SinoPac_comp',
           config={
               "learning_rate": lr,
               "epochs": epochs,
               "batch_size": batch_size,
               "weight_decay": weight_decay,
           },
           name='exp15')

In [None]:
best_mape = None
history = []

for epoch in range(epochs):
  train_loss, train_mape, train_time = train_one_epoch(train_loader)
  val_loss, val_mape, val_time, best_mape = val_one_epoch(test_loader, best_mape)

  total_time = train_time + val_time
  output_str = f"Epoch {epoch+1}/{epochs} - loss: {train_loss:.4f} - train_mape: {train_mape:.2f} - test_loss: {val_loss:.4f} - test_mape: {val_mape:.2f} - time: {total_time:.2f}s"
  history.append(output_str)
  print(output_str)

print()
print(f'MAPE : {best_mape:.4f}')

wandb.finish()