# Setup

In [None]:
!pip install wandb

Collecting wandb
  Downloading wandb-0.15.12-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.40-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.6/190.6 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.33.0-py2.py3-none-any.whl (243 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m243.2/243.2 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting pathtools (from wandb)
  Downloading pathtools-0.1.2.tar.gz (11 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.ma

In [None]:
# Mount Google Drive at /content/drive; the CSV files read later in this notebook
# live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import wandb

from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from torch.utils.data import TensorDataset, DataLoader

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook still runs on CPU-only runtimes instead of failing at the first `.to(device)`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed NumPy and PyTorch so that data splitting, weight init and shuffling are repeatable.
np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x789118ba6970>

In [None]:
# Root folder of the competition files on the mounted Google Drive.
# Change this single constant if the data is moved.
DATA_DIR = '/content/drive/MyDrive/ME /SinoPac'

train = pd.read_csv(f'{DATA_DIR}/30_Training Dataset_V2/training_data_v1017.csv')  # training data
testpub = pd.read_csv(f'{DATA_DIR}/public_dataset_v1017.csv')                      # testing data (public leaderboard)
subpub = pd.read_csv(f'{DATA_DIR}/public_submission_template.csv')                 # submission file

# Preprocessing

In [None]:
# Report (rows, columns) of each loaded frame; labels are padded so the output aligns.
for label, frame in (
    ('train_shape  :', train),
    ('testpub_shape:', testpub),
    ('subpub_shape :', subpub),
):
    print(label, frame.shape)

train_shape  : (11751, 30)
testpub_shape: (5876, 29)
subpub_shape : (5876, 2)


In [None]:
# Chinese -> English column names, applied to both the training and the public test frame.
column_name_mapping = {
    '縣市': 'County',
    '鄉鎮市區': 'District',
    '路名': 'Road',
    '土地面積': 'Land Area',
    '使用分區': 'Use Partitions',
    '移轉層次': 'Floor Level',
    '總樓層數': 'Total Floors',
    '主要用途': 'Main Purpose',
    '主要建材': 'Main Materials',
    '建物型態': 'Building Type',
    '屋齡': 'House Age',
    '建物面積': 'Building Area',
    '車位面積': 'Parking Area',
    '車位個數': 'Num of Parking',
    '橫坐標': 'x-axis',
    '縱坐標': 'y-axis',
    '主建物面積': 'Main Building Area',
    '陽台面積': 'Balcony Area',
    '附屬建物面積': 'Ancillary Building Area',
    '單價': 'Price'
}

# Build renamed working copies (the raw `train` / `testpub` frames stay untouched)
# and drop the '備註' (remarks) column from both.
df = train.rename(columns=column_name_mapping).drop(columns=['備註'])
df_test = testpub.rename(columns=column_name_mapping).drop(columns=['備註'])

In [None]:
df = pd.DataFrame(df)
df_test = pd.DataFrame(df_test)


def _strip_id_prefix(raw_id, prefix):
    """Return `raw_id` as an int with `prefix` removed; values that are not
    strings starting with `prefix` are passed through unchanged."""
    if isinstance(raw_id, str) and raw_id.startswith(prefix):
        return int(raw_id.replace(prefix, ''))
    return raw_id


# Training IDs look like 'TR-<n>' and public-test IDs like 'PU-<n>'; keep only the number.
df['ID'] = df['ID'].apply(_strip_id_prefix, args=('TR-',))
df_test['ID'] = df_test['ID'].apply(_strip_id_prefix, args=('PU-',))

label_encoder = LabelEncoder()
object_columns = df.select_dtypes(include=['object']).columns

for column in object_columns:
    # Fit on train + test together so that categories occurring only in the
    # test frame still receive an integer code.
    combined_data = pd.concat([df[column], df_test[column]])
    label_encoder.fit(combined_data)

    df[column], df_test[column] = (
        label_encoder.transform(df[column]),
        label_encoder.transform(df_test[column]),
    )

In [None]:
# Correlation of every column with the target column 'Price'.
correlation_matrix = df.corr()
correlation_with_target = correlation_matrix.loc[:, 'Price']
print(correlation_with_target)

ID                            -0.002611
County                        -0.459016
District                      -0.235662
Road                          -0.035781
Land Area                      0.067088
Use Partitions                 0.008904
Floor Level                    0.054038
Total Floors                   0.037874
Main Purpose                  -0.057103
Main Materials                 0.011903
Building Type                  0.016075
House Age                      0.049196
Building Area                  0.061327
Parking Area                   0.099986
Num of Parking                 0.013064
x-axis                         0.455633
y-axis                         0.394402
Main Building Area             0.056150
Balcony Area                   0.009866
Ancillary Building Area        0.064905
Price                          1.000000
lng                            0.456070
lat                            0.394313
ATM Count within 10km          0.667817
Bank Count within 10km         0.689029


In [None]:
# Data splitting: separate the target from the features, drop the identifier and
# region columns from both frames, then hold out 20% of the training rows.
y = df['Price']
X = df.drop(columns=['Price', 'ID', 'County', 'District'])
df_test = df_test.drop(columns=['ID', 'County', 'District'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

In [None]:
# Convert the pandas splits to float32 tensors; targets are reshaped into
# column vectors of shape (N, 1).
X_train = torch.tensor(X_train.values, dtype=torch.float32)
y_train = torch.tensor(y_train.values.reshape(-1, 1), dtype=torch.float32)
X_test = torch.tensor(X_test.values, dtype=torch.float32)
y_test = torch.tensor(y_test.values.reshape(-1, 1), dtype=torch.float32)

# Pair features with targets so a DataLoader can batch them together.
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

# Model and Training Functions

In [None]:
# MLP
class MLP(nn.Module):
    """Fully connected regressor.

    Layout: Linear(input_size -> hidden_size) + ReLU, followed by `num_layers`
    identical blocks of Linear -> ReLU -> BatchNorm1d -> Dropout, followed by
    Linear(hidden_size -> output_size) with no final activation.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers, dropout_rate=0.5):
        super().__init__()

        # Projection from the raw features into the hidden width
        self.input_layer = nn.Linear(input_size, hidden_size)

        # Stack of equally shaped hidden blocks
        blocks = []
        for _ in range(num_layers):
            blocks.append(self._hidden_block(hidden_size, dropout_rate))
        self.hidden_layers = nn.ModuleList(blocks)

        # Projection from the hidden width to the prediction
        self.output_layer = nn.Linear(hidden_size, output_size)

    @staticmethod
    def _hidden_block(width, dropout_rate):
        """Build one Linear -> ReLU -> BatchNorm1d -> Dropout block of the given width."""
        return nn.Sequential(
            nn.Linear(width, width),
            nn.ReLU(),
            nn.BatchNorm1d(width),
            nn.Dropout(p=dropout_rate),
        )

    def forward(self, x):
        """Run `x` through the input layer, every hidden block, and the output layer."""
        hidden = F.relu(self.input_layer(x))
        for block in self.hidden_layers:
            hidden = block(hidden)
        return self.output_layer(hidden)

In [None]:
def train_one_epoch(train_data_loader):
  """Train the global `model` for one pass over `train_data_loader`.

  Uses the notebook-level globals `model`, `optimizer`, `criterion` and `device`,
  and logs `train_loss` / `train_mape` to the active wandb run.

  Args:
    train_data_loader: iterable yielding `(variables, labels)` tensor batches.

  Returns:
    A tuple `(epoch_loss, mape, total_time)` where `epoch_loss` is the mean of
    the per-batch losses, `mape` is the MAPE over all samples of the epoch
    expressed in percent (the same scale `val_one_epoch` reports), and
    `total_time` is the elapsed wall-clock time in seconds.
  """
  batch_losses = []
  trues = []
  prediction = []
  start_time = time.time()
  model.train()

  for variables, labels in train_data_loader:
    variables = variables.to(device)
    labels = labels.to(device)
    optimizer.zero_grad()

    # forward
    preds = model(variables)
    loss = criterion(preds, labels)

    # backward
    loss.backward()
    optimizer.step()

    # Keep plain numbers / arrays only, so no autograd graph is retained across batches.
    batch_losses.append(loss.item())
    trues.append(labels.detach().cpu().numpy())
    prediction.append(preds.detach().cpu().numpy())

  trues = np.concatenate(trues)
  prediction = np.concatenate(prediction)
  # Predictions were collected in train mode (model.train() above), i.e. while
  # training-time layers such as dropout are active.
  # Scale to percent so train and validation MAPE are directly comparable.
  mape = mean_absolute_percentage_error(trues, prediction) * 100
  total_time = time.time() - start_time
  epoch_loss = np.mean(batch_losses)
  wandb.log({"train_loss": epoch_loss, "train_mape": mape})

  return epoch_loss, mape, total_time

def val_one_epoch(val_data_loader, best_mape, checkpoint_path="model.pth"):
  """Evaluate the global `model` on `val_data_loader` and checkpoint on improvement.

  Uses the notebook-level globals `model`, `criterion` and `device`, and logs
  `test_loss` / `test_mape` to the active wandb run.

  Args:
    val_data_loader: iterable yielding `(variables, labels)` tensor batches.
    best_mape: lowest MAPE (in percent) seen in earlier epochs, or `None` if
      no epoch has been evaluated yet.
    checkpoint_path: file the model `state_dict` is written to whenever the
      MAPE improves. Defaults to "model.pth".

  Returns:
    A tuple `(epoch_loss, mape, total_time, best_mape)` where `epoch_loss` is
    the mean of the per-batch losses, `mape` is the MAPE over all samples in
    percent, `total_time` is the elapsed wall-clock time in seconds, and
    `best_mape` is the updated best value.
  """
  batch_losses = []
  trues = []
  prediction = []
  start_time = time.time()
  model.eval()

  with torch.no_grad():
    for variables, labels in val_data_loader:
      variables = variables.to(device)
      labels = labels.to(device)
      preds = model(variables)

      batch_losses.append(criterion(preds, labels).item())
      trues.append(labels.detach().cpu().numpy())
      prediction.append(preds.detach().cpu().numpy())

  trues = np.concatenate(trues)
  prediction = np.concatenate(prediction)
  mape = mean_absolute_percentage_error(trues, prediction) * 100
  total_time = time.time() - start_time
  epoch_loss = np.mean(batch_losses)
  wandb.log({"test_loss": epoch_loss, "test_mape": mape})

  # The first evaluated epoch and every strictly better epoch overwrite the checkpoint.
  if best_mape is None or mape < best_mape:
    best_mape = mape
    torch.save(model.state_dict(), checkpoint_path)

  return epoch_loss, mape, total_time, best_mape

# Model Training

In [None]:
# Define parameters
batch_size = 64
epochs = 500
lr = 0.00001
weight_decay = 0.0001

input_size = X_train.shape[1]
hidden_size = 128
num_layers = 3
output_size = 1

# Only the training batches are shuffled; evaluation order is kept fixed.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

model = MLP(input_size, hidden_size, output_size, num_layers)
model = model.to(device)
criterion = nn.SmoothL1Loss()
optimizer = torch.optim.RAdam(model.parameters(), lr=lr, weight_decay=weight_decay)
# Every metric tracked in this notebook (loss, MAPE) is lower-is-better, so the
# plateau detector must run in 'min' mode; with 'max' it would cut the learning
# rate while the metric is still improving.
lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1, patience=10, mode='min')

In [None]:
# Authenticate with Weights & Biases, then open a run that records the
# hyperparameters of this experiment.
wandb.login()
wandb.init(
    project='SinoPac_comp',
    name='exp15',
    config=dict(
        learning_rate=lr,
        epochs=epochs,
        batch_size=batch_size,
        weight_decay=weight_decay,
    ),
)

In [None]:
best_mape = None
history = []

for epoch in range(epochs):
  train_loss, train_mape, train_time = train_one_epoch(train_loader)
  val_loss, val_mape, val_time, best_mape = val_one_epoch(test_loader, best_mape)

  # ReduceLROnPlateau only lowers the learning rate when it is stepped with the
  # monitored metric. Validation MAPE is lower-is-better, so it is negated when
  # the scheduler was configured in 'max' mode.
  plateau_metric = val_mape if lr_scheduler.mode == 'min' else -val_mape
  lr_scheduler.step(plateau_metric)

  total_time = train_time + val_time
  output_str = f"Epoch {epoch+1}/{epochs} - loss: {train_loss:.4f} - train_mape: {train_mape:.2f} - test_loss: {val_loss:.4f} - test_mape: {val_mape:.2f} - time: {total_time:.2f}s"
  history.append(output_str)
  print(output_str)

print()
# Best (lowest) validation MAPE over all epochs; the matching weights are the
# ones val_one_epoch saved to disk.
print(f'MAPE : {best_mape:.4f}')

wandb.finish()

Epoch 1/500 - loss: 1.6184 - train_mape: 1.09 - test_loss: 1.5479 - test_mape: 105.27 - time: 0.61s
Epoch 2/500 - loss: 1.5983 - train_mape: 1.08 - test_loss: 1.5423 - test_mape: 104.71 - time: 0.64s
Epoch 3/500 - loss: 1.5957 - train_mape: 1.08 - test_loss: 1.5545 - test_mape: 106.64 - time: 0.80s
Epoch 4/500 - loss: 1.5900 - train_mape: 1.09 - test_loss: 1.5143 - test_mape: 104.99 - time: 0.83s
Epoch 5/500 - loss: 1.5786 - train_mape: 1.08 - test_loss: 1.5029 - test_mape: 105.03 - time: 0.80s
Epoch 6/500 - loss: 1.5672 - train_mape: 1.08 - test_loss: 1.5285 - test_mape: 106.77 - time: 0.81s
Epoch 7/500 - loss: 1.5644 - train_mape: 1.08 - test_loss: 1.5203 - test_mape: 106.99 - time: 0.83s
Epoch 8/500 - loss: 1.5613 - train_mape: 1.09 - test_loss: 1.6417 - test_mape: 114.30 - time: 0.63s
Epoch 9/500 - loss: 1.5506 - train_mape: 1.09 - test_loss: 1.7212 - test_mape: 119.50 - time: 0.56s
Epoch 10/500 - loss: 1.5410 - train_mape: 1.08 - test_loss: 1.4708 - test_mape: 105.87 - time: 0.56s

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
test_loss,▅▅█▄▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁
test_mape,▄▄█▃▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss,██▇▆▆▅▄▃▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_mape,██▇▇▆▅▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
test_loss,0.29588
test_mape,25.87645
train_loss,0.23529
train_mape,0.29439


# Test and Submission