# House Price Prediction Kaggle
- Minh Nguyen
- 11/20/2024
- https://www.kaggle.com/c/house-prices-advanced-regression-techniques

In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch.nn as nn
import torch.optim as optim

# Load dataset
train_data = pd.read_csv("data/kaggle_house/train.csv")
test_data = pd.read_csv("data/kaggle_house/test.csv")

# Drop ID and separate label
train_data.drop(columns=['Id'], inplace=True)
test_ids = test_data['Id']
test_data.drop(columns=['Id'], inplace=True)

# Identify numeric columns excluding 'SalePrice'
numeric_cols = train_data.select_dtypes(include=[np.number]).columns
numeric_cols = numeric_cols.drop('SalePrice')  # Drop 'SalePrice' as it doesn't exist in test_data

# Fill missing values with column mean for both datasets
train_data[numeric_cols] = train_data[numeric_cols].fillna(train_data[numeric_cols].mean())
test_data[numeric_cols] = test_data[numeric_cols].fillna(test_data[numeric_cols].mean())

# One-hot encode categorical variables
train_data = pd.get_dummies(train_data, dummy_na=True)
test_data = pd.get_dummies(test_data, dummy_na=True)

# Save SalePrice separately
labels = np.log1p(train_data['SalePrice'].values)  # Use log-transformed prices
train_data.drop(columns=['SalePrice'], inplace=True)

# Align train and test data (Exclude SalePrice from the alignment operation)
train_data, test_data = train_data.align(test_data, join='left', axis=1, fill_value=0)

# Standardize numerical features
scaler = StandardScaler()
features = scaler.fit_transform(train_data.values)
test_data = scaler.transform(test_data.values)

# Split data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(features, labels, test_size=0.2, random_state=42)

# Debug output to verify dimensions
print(f"Training data shape: {X_train.shape}, Validation data shape: {X_val.shape}")
print(f"Test data shape: {test_data.shape}")


Training data shape: (1168, 330), Validation data shape: (292, 330)
Test data shape: (1459, 330)


In [2]:
class HouseDataset(Dataset):
    def __init__(self, features, labels=None):
        self.features = torch.tensor(features, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.float32) if labels is not None else None

    def __len__(self):
        return self.features.shape[0]

    def __getitem__(self, idx):
        if self.labels is not None:
            return self.features[idx], self.labels[idx]
        else:
            return self.features[idx]

# Create datasets
train_dataset = HouseDataset(X_train, y_train)
val_dataset = HouseDataset(X_val, y_val)
test_dataset = HouseDataset(test_data)

# Create data loaders
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

for X, y in train_loader:
    print(X.shape)
    print(y.shape)
    break

torch.Size([64, 330])
torch.Size([64])


In [3]:
class MLP(nn.Module):
    def __init__(self, input_size, hidden_size_1, hidden_size_2, output_size, dropout_rate=0.2):
        super(MLP, self).__init__()
        self.hidden_layer_1 = nn.Linear(input_size, hidden_size_1)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout_rate)
        self.hidden_layer_2 = nn.Linear(hidden_size_1, hidden_size_2)
        self.output_layer = nn.Linear(hidden_size_2, output_size)
    
    def forward(self, X):
        X = self.hidden_layer_1(X) # 330 x hidden_size 1
        X = self.activation(X)
        X = self.dropout(X)
        X = self.hidden_layer_2(X) # hidden_size 1 x hidden_size 2
        X = self.activation(X)
        X = self.dropout(X)
        logits = self.output_layer(X)
        return logits

In [4]:
def train_model(model, train_loader, val_loader, num_epochs=100, learning_rate=0.001):
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(num_epochs):
        # Training phase
        model.train()
        train_loss = 0.0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            y_pred = model(X_batch)
            loss = criterion(y_pred, y_batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * X_batch.size(0)

        # Validation phase
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                y_pred = model(X_batch)
                loss = criterion(y_pred, y_batch)
                val_loss += loss.item() * X_batch.size(0)

        train_loss /= len(train_loader.dataset)
        val_loss /= len(val_loader.dataset)
        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

# Initialize and train the model
input_dim = X_train.shape[1] # 330
model = MLP(input_size=input_dim, hidden_size_1=128, hidden_size_2=64, output_size=1)
train_model(model, train_loader, val_loader, num_epochs=50, learning_rate=0.001)


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 1/50, Train Loss: 127.2579, Val Loss: 96.4406
Epoch 2/50, Train Loss: 54.5261, Val Loss: 12.1233
Epoch 3/50, Train Loss: 8.7659, Val Loss: 3.5954
Epoch 4/50, Train Loss: 4.9562, Val Loss: 3.0329
Epoch 5/50, Train Loss: 3.7064, Val Loss: 2.3235
Epoch 6/50, Train Loss: 3.1308, Val Loss: 2.4232
Epoch 7/50, Train Loss: 3.0921, Val Loss: 2.4270
Epoch 8/50, Train Loss: 2.6676, Val Loss: 2.1118
Epoch 9/50, Train Loss: 2.8439, Val Loss: 2.1467
Epoch 10/50, Train Loss: 2.8111, Val Loss: 2.1340
Epoch 11/50, Train Loss: 2.6964, Val Loss: 1.9729
Epoch 12/50, Train Loss: 2.6093, Val Loss: 2.4092
Epoch 13/50, Train Loss: 2.5766, Val Loss: 1.8561
Epoch 14/50, Train Loss: 2.5639, Val Loss: 2.0240
Epoch 15/50, Train Loss: 2.5398, Val Loss: 2.1079
Epoch 16/50, Train Loss: 2.6829, Val Loss: 2.0138
Epoch 17/50, Train Loss: 2.6460, Val Loss: 2.0189
Epoch 18/50, Train Loss: 2.4348, Val Loss: 2.2694
Epoch 19/50, Train Loss: 2.4555, Val Loss: 2.2041
Epoch 20/50, Train Loss: 2.3184, Val Loss: 2.0577
Epoc

In [5]:
def predict(model, test_loader):
    model.eval()
    predictions = []
    with torch.no_grad():
        for X_batch in test_loader:
            y_pred = model(X_batch).numpy().flatten()  # Ensure predictions are 1D
            predictions.extend(y_pred)
    return predictions

# Predict and generate submission file
predictions = predict(model, test_loader)
predictions = np.expm1(predictions)  # Reverse log transformation

# Ensure predictions are 1D
predictions = np.array(predictions).ravel()  # Flatten if needed
submission = pd.DataFrame({'Id': test_ids, 'SalePrice': predictions})

submission.to_csv('data/kaggle_house/submission.csv', index=False)
print("Submission file created: submission.csv")


Submission file created: submission.csv
