In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt

In [6]:
# Feature groups: categorical columns get one-hot encoded, numeric columns
# are forwarded to the model unchanged.
categorical_cols = ['flat_model']
numerical_cols = [
    'time',
    'storey_avg',
    'floor_area_sqm',
    'flat_type_encoded',
    'remaining_lease_months',
]

# Output columns follow the transformer order: numeric block first, then the
# one-hot block. handle_unknown='ignore' lets transform() accept categories
# that were not present when the encoder was fitted instead of raising.
preprocessor = ColumnTransformer([
    ('num', 'passthrough', numerical_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
])

In [8]:
# Read the pre-split train / test tables.
train_data = pd.read_csv('../../data/processed/train.csv')
test_data = pd.read_csv('../../data/processed/test.csv')

# Model inputs (numeric columns first, then categorical) and the target.
X_train, y_train = (
    train_data[[*numerical_cols, *categorical_cols]],
    train_data['resale_price'],
)
X_test, y_test = (
    test_data[[*numerical_cols, *categorical_cols]],
    test_data['resale_price'],
)

# Fit the preprocessor on the training split only; the test split is
# transformed with the already-fitted encoder.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [10]:
# Convert to PyTorch tensors
def _as_dense(features):
    """Return `features` as a dense array.

    ColumnTransformer yields a scipy sparse matrix only when the stacked
    output is sparse enough (see its `sparse_threshold` parameter); otherwise
    it yields a plain ndarray, which has no `.toarray()`. Handle both so this
    cell does not break when the data or the encoder settings change.
    """
    if hasattr(features, "toarray"):
        return features.toarray()
    return np.asarray(features)


X_train_tensor = torch.FloatTensor(_as_dense(X_train_processed))
# Targets are reshaped to a column vector so they line up with the network's
# (batch, 1) output when the loss is computed.
y_train_tensor = torch.FloatTensor(y_train.values).view(-1, 1)
X_test_tensor = torch.FloatTensor(_as_dense(X_test_processed))
y_test_tensor = torch.FloatTensor(y_test.values).view(-1, 1)

# Create Dataset class
class HDBDataset(Dataset):
    """Map-style dataset that pairs each feature row with its target.

    Args:
        X: Indexable, sized collection of feature rows.
        y: Indexable, sized collection of targets, aligned with ``X`` by
            position.

    Raises:
        ValueError: If ``X`` and ``y`` do not have the same length.
    """

    def __init__(self, X, y):
        # Fail fast: a length mismatch would otherwise surface as an
        # IndexError in the middle of an epoch, or silently ignore targets.
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )
        self.X = X
        self.y = y

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at position ``idx``."""
        return self.X[idx], self.y[idx]

# Wrap the tensors in datasets, then batch them. The training loader
# reshuffles each epoch; the test loader keeps the original row order.
train_dataset = HDBDataset(X_train_tensor, y_train_tensor)
test_dataset = HDBDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=32)

# Column count of the processed feature matrix; used as the width of the
# network's input layer.
input_dim = X_train_processed.shape[1]
print(f"Neural network input dimension: {input_dim}")

Neural network input dimension: 26


In [11]:
class NeuralNetwork(nn.Module):
    """Fully connected regressor: ``input_dim -> 128 -> 64 -> 32 -> 1``.

    Each hidden stage is Linear -> ReLU -> BatchNorm1d; the first two stages
    are followed by Dropout (p=0.3 and p=0.2). The final layer is a plain
    Linear that emits one value per sample.

    Args:
        input_dim: Number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()

        widths = (input_dim, 128, 64, 32)
        dropouts = (0.3, 0.2, None)  # None -> no dropout after that stage

        stack = []
        for fan_in, fan_out, drop_p in zip(widths, widths[1:], dropouts):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU())
            stack.append(nn.BatchNorm1d(fan_out))
            if drop_p is not None:
                stack.append(nn.Dropout(drop_p))
        stack.append(nn.Linear(32, 1))

        # Kept under the attribute name `layers` so state-dict keys stay
        # `layers.<index>.<param>`.
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Apply the layer stack to ``x`` and return the result."""
        return self.layers(x)

# Choose the compute device: the first CUDA GPU when one is available,
# otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Build the network on that device, with a mean-squared-error loss and an
# Adam optimizer over all of its parameters.
model = NeuralNetwork(input_dim)
model = model.to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

Using device: cpu


In [13]:
def train_epoch(model, loader, criterion, optimizer, device):
    """Run one optimisation pass over ``loader``.

    Args:
        model: Network to train; switched to training mode.
        loader: Iterable of ``(features, targets)`` batches exposing a sized
            ``.dataset`` attribute.
        criterion: Callable mapping ``(outputs, targets)`` to a scalar loss.
        optimizer: Optimizer that updates ``model``'s parameters.
        device: Device each batch is moved to before the forward pass.

    Returns:
        Sum of ``batch_loss * batch_size`` over all batches, divided by
        ``len(loader.dataset)``.
    """
    model.train()
    loss_sum = 0.0

    for features, targets in loader:
        features = features.to(device)
        targets = targets.to(device)

        # Clear gradients left over from the previous step, then do
        # forward -> backward -> parameter update.
        optimizer.zero_grad()
        batch_loss = criterion(model(features), targets)
        batch_loss.backward()
        optimizer.step()

        # Weight by batch size so a smaller final batch is not over-counted
        # (assumes criterion returns a per-batch mean).
        loss_sum += batch_loss.item() * features.size(0)

    return loss_sum / len(loader.dataset)

def validate(model, loader, criterion, device):
    """Compute the loss of ``model`` over ``loader`` without updating it.

    Args:
        model: Network to evaluate; switched to evaluation mode.
        loader: Iterable of ``(features, targets)`` batches exposing a sized
            ``.dataset`` attribute.
        criterion: Callable mapping ``(outputs, targets)`` to a scalar loss.
        device: Device each batch is moved to before the forward pass.

    Returns:
        Sum of ``batch_loss * batch_size`` over all batches, divided by
        ``len(loader.dataset)``.
    """
    model.eval()
    loss_sum = 0.0

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for features, targets in loader:
            features = features.to(device)
            targets = targets.to(device)
            batch_loss = criterion(model(features), targets)
            loss_sum += batch_loss.item() * features.size(0)

    return loss_sum / len(loader.dataset)

# Training-loop settings
num_epochs = 100  # upper bound on the number of epochs
patience = 15     # non-improving epochs tolerated before early stopping

# Bookkeeping updated by the training loop
train_losses, val_losses = [], []
best_val_loss = float('inf')  # any finite validation loss improves on this
patience_counter = 0
best_model_state = None       # no snapshot taken yet

In [14]:
# Create a validation set from the training data (80% train / 20% validation).
# The split is seeded so the same rows are held out on every run, which keeps
# validation losses comparable between runs.
SPLIT_SEED = 42
train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_subset, val_subset = torch.utils.data.random_split(
    train_dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
train_loader = DataLoader(train_subset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_subset, batch_size=32, shuffle=False)

for epoch in range(num_epochs):
    train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
    val_loss = validate(model, val_loader, criterion, device)
    
    train_losses.append(train_loss)
    val_losses.append(val_loss)
    
    print(f'Epoch {epoch+1}/{num_epochs}, '
          f'Train Loss: {train_loss:.4f}, '
          f'Val Loss: {val_loss:.4f}')
    
    # Early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        # Snapshot the weights by cloning every tensor. state_dict() hands
        # back references to the live parameters and buffers, so a shallow
        # dict copy would keep changing with each optimizer step and the
        # "best" state would end up being the final one.
        best_model_state = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }
        patience_counter = 0
    else:
        patience_counter += 1
        
    if patience_counter >= patience:
        print(f'Early stopping after {epoch+1} epochs')
        break

# Load the best model (the weights from the epoch with the lowest validation loss)
if best_model_state is not None:
    model.load_state_dict(best_model_state)

# Predict on the full training and test tensors in evaluation mode, without
# tracking gradients; results are flattened to 1-D NumPy arrays.
model.eval()
with torch.no_grad():
    nn_pred_train = model(X_train_tensor.to(device)).cpu().numpy().flatten()
    nn_pred_test = model(X_test_tensor.to(device)).cpu().numpy().flatten()

Epoch 1/100, Train Loss: 294941841322.6673, Val Loss: 293296671373.9879
Epoch 2/100, Train Loss: 292596916775.0331, Val Loss: 289755305893.1353
Epoch 3/100, Train Loss: 288674662767.5906, Val Loss: 283025060782.0085
Epoch 4/100, Train Loss: 283337148302.1029, Val Loss: 277335325885.7249
Epoch 5/100, Train Loss: 276668714387.8918, Val Loss: 271797137670.7490
Epoch 6/100, Train Loss: 268765548421.1824, Val Loss: 262610310993.3566
Epoch 7/100, Train Loss: 259699304038.0488, Val Loss: 244217881247.4836
Epoch 8/100, Train Loss: 249595090130.9801, Val Loss: 257488622044.2091
Epoch 9/100, Train Loss: 238612324149.2585, Val Loss: 238143528044.7054
Epoch 10/100, Train Loss: 226824235950.9551, Val Loss: 229098473342.8830
Epoch 11/100, Train Loss: 214370996794.3791, Val Loss: 222894646077.0076
Epoch 12/100, Train Loss: 201403621072.9224, Val Loss: 195614920160.3165
Epoch 13/100, Train Loss: 188161514497.8891, Val Loss: 192549708352.1666
Epoch 14/100, Train Loss: 174655045815.3641, Val Loss: 17572

In [16]:
def evaluate_model(y_true, y_pred, model_name):
    """Print a regression report for one set of predictions and return it.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values, aligned with ``y_true``.
        model_name: Label shown in the report header.

    Returns:
        Tuple ``(mse, rmse, mae, r2)``.
    """
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)  # root of the MSE

    print(f"\n{model_name} Evaluation Metrics:")
    print(f"MSE: {mse:.2f}")
    print(f"RMSE: {rmse:.2f}")
    print(f"MAE: {mae:.2f}")
    print(f"R²: {r2:.4f}")

    metrics = (mse, rmse, mae, r2)
    return metrics

# Report the metrics on both splits; each result is an (mse, rmse, mae, r2) tuple.
print("\nTraining set evaluation:")
nn_train_metrics = evaluate_model(
    y_true=y_train,
    y_pred=nn_pred_train,
    model_name="Neural Network",
)

print("\nTest set evaluation:")
nn_test_metrics = evaluate_model(
    y_true=y_test,
    y_pred=nn_pred_test,
    model_name="Neural Network",
)


Training set evaluation:

Neural Network Evaluation Metrics:
MSE: 8647817847.78
RMSE: 92993.64
MAE: 69613.55
R²: 0.7336

Test set evaluation:

Neural Network Evaluation Metrics:
MSE: 9137940629.08
RMSE: 95592.58
MAE: 69576.80
R²: 0.7184
