# House Price Predictor

This notebook loads and preprocesses the training data (`train.csv`), trains a deep neural network model, tunes hyperparameters via grid search with early stopping, and finally predicts `SalePrice` for the data in `test.csv`. The final submission output is a DataFrame with two columns: `Id` and the predicted `SalePrice`.

In [21]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import itertools
import os

## 1. Load and Preprocess Training Data (train.csv)

We load `train.csv`, one-hot encode categorical features, drop missing values, and select the top features based on correlation with `SalePrice`.

In [22]:
# Load train.csv
train_df = pd.read_csv('train.csv')
target = 'SalePrice'

# Identify categorical columns (object or category type)
categorical_cols = train_df.select_dtypes(include=['object', 'category']).columns

# One-hot encode categorical variables (drop_first to avoid dummy variable trap)
data = pd.get_dummies(train_df, columns=categorical_cols, dtype=int, drop_first=True)

# Drop rows with missing values
data.dropna(inplace=True)

# Save the full list of feature columns (after dummy encoding) for later use
train_columns = data.drop(columns=[target]).columns

# Compute correlation matrix and select top features (top 20, adjust if needed)
corr_matrix = data.corr()
target_corr = corr_matrix[target].drop(target).abs().sort_values(ascending=False)
top_features = target_corr.head(15).index
print("Selected top features:", list(top_features))

# Define input features (X) and target (y) using the top features
X = data[top_features].values
y = data[target].values.reshape(-1, 1)

Selected top features: ['OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF', '1stFlrSF', 'ExterQual_TA', 'TotRmsAbvGrd', 'FullBath', 'KitchenQual_TA', 'YearRemodAdd', 'YearBuilt', 'Foundation_PConc', 'GarageFinish_Unf', 'GarageYrBlt']


## 2. Split Data: Train, Validation, and Test (from train.csv)

We split the data into 80% train+validation and 20% test, then split train+validation into approximately 60% training and 20% validation.

In [23]:
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.25, random_state=42)

## 3. Scale Features and Target

We use two separate StandardScalers: one for the features and one for the target (`SalePrice`).

In [24]:
# Scale features
scaler_X = StandardScaler()
X_train = scaler_X.fit_transform(X_train)
X_val   = scaler_X.transform(X_val)
X_test  = scaler_X.transform(X_test)

# Scale target
scaler_y = StandardScaler()
y_train_scaled = scaler_y.fit_transform(y_train)
y_val_scaled   = scaler_y.transform(y_val)
y_test_scaled  = scaler_y.transform(y_test)

## 4. Convert Data to PyTorch Tensors

We convert our numpy arrays into PyTorch tensors and create TensorDatasets for training and validation.

In [25]:
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_scaled, dtype=torch.float32)
X_val_tensor   = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor   = torch.tensor(y_val_scaled, dtype=torch.float32)
X_test_tensor  = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor  = torch.tensor(y_test_scaled, dtype=torch.float32)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset   = TensorDataset(X_val_tensor, y_val_tensor)

## 5. Define the Deep Neural Network Model

We define a deep neural network with three hidden layers and dropout.

In [26]:
class HousePriceNNDeep(nn.Module):
    def __init__(self, input_dim, hidden1=128, hidden2=64, dropout=0.5):
        super(HousePriceNNDeep, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x


## 6. Define the Training Function with Early Stopping and Weight Decay

This function trains the model using the provided hyperparameters, applies early stopping, and computes unscaled validation metrics.

In [27]:
def train_and_evaluate_deep(hparams, device='cpu', patience=20):
    """
    Trains the deep network using provided hyperparameters.
    Returns: best validation loss (scaled), model, unscaled validation MSE, unscaled validation R².
    hparams: dict with keys: 'learning_rate', 'batch_size', 'num_epochs', 'hidden1', 'hidden2', 'hidden3', 'dropout'
    """
    input_dim = X_train_tensor.shape[1]
    model = HousePriceNNDeep(input_dim,
                             hidden1=hparams['hidden1'],
                             hidden2=hparams['hidden2'],
                             dropout=hparams['dropout']).to(device)
    
    train_loader = DataLoader(train_dataset, batch_size=hparams['batch_size'], shuffle=True)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=hparams['learning_rate'], weight_decay=1e-5)
    
    best_val_loss = float('inf')
    epochs_without_improve = 0
    best_model_state = None
    
    for epoch in range(hparams['num_epochs']):
        model.train()
        running_loss = 0.0
        for batch_X, batch_y in train_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * batch_X.size(0)
        epoch_loss = running_loss / len(train_dataset)
        
        # Evaluate on validation set (scaled targets)
        model.eval()
        with torch.no_grad():
            val_preds = model(X_val_tensor.to(device))
            val_loss = criterion(val_preds, y_val_tensor.to(device)).item()
        
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_without_improve = 0
            best_model_state = model.state_dict()
        else:
            epochs_without_improve += 1
        
        if epochs_without_improve >= patience:
            print(f"Early stopping at epoch {epoch+1}, best validation loss (scaled): {best_val_loss:.4f}")
            break

    model.load_state_dict(best_model_state)
    
    # Compute unscaled performance on the validation set:
    model.eval()
    with torch.no_grad():
        val_preds = model(X_val_tensor.to(device))
        val_preds_np = val_preds.cpu().numpy()
        y_val_np = y_val_tensor.cpu().numpy()
        
        # Inverse-transform predictions and true values
        val_preds_unscaled = scaler_y.inverse_transform(val_preds_np)
        y_val_unscaled = scaler_y.inverse_transform(y_val_np)
        
        unscaled_val_mse = mean_squared_error(y_val_unscaled, val_preds_unscaled)
        unscaled_val_r2 = r2_score(y_val_unscaled, val_preds_unscaled)
    
    return best_val_loss, model, unscaled_val_mse, unscaled_val_r2

## 7. Hyperparameter Tuning via Grid Search

We iterate over a grid of hyperparameters to find the best settings based on validation performance.

In [28]:
import random

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

param_grid = {
    'learning_rate': [0.0005, 0.001, 0.005, 0.01],
    'batch_size': [16, 32, 64],
    'num_epochs': [100, 200, 500,1000],
    'hidden1': [128, 256, 512],
    'hidden2': [64, 128, 256],
    'dropout': [0.2, 0.3, 0.5]
}

best_val_loss = float('inf')
best_params = None
best_model_state = None
best_unscaled_val_mse = None
best_unscaled_val_r2 = None

n_iter = 20  # number of random combinations to try

for i in range(n_iter):
    # Randomly sample one value for each parameter:
    hparams = { key: random.choice(values) for key, values in param_grid.items() }
    
    val_loss_scaled, temp_model, val_mse_unscaled, val_r2_unscaled = train_and_evaluate_deep(hparams, device=device, patience=20)
    print(f"Iteration {i+1}: Params: {hparams} -> Scaled Val Loss: {val_loss_scaled:.4f} | Unscaled Val MSE: {val_mse_unscaled:.2f}, R²: {val_r2_unscaled:.4f}")
    
    if val_loss_scaled < best_val_loss:
        best_val_loss = val_loss_scaled
        best_params = hparams
        best_model_state = temp_model.state_dict()
        best_unscaled_val_mse = val_mse_unscaled
        best_unscaled_val_r2 = val_r2_unscaled

print("\nBest hyperparameters found for the deep network:")
print(best_params)
print(f"Best validation loss (scaled): {best_val_loss:.4f}")
print(f"Unscaled Validation MSE: {best_unscaled_val_mse:.2f}")
print(f"Unscaled Validation R²: {best_unscaled_val_r2:.4f}")


Early stopping at epoch 38, best validation loss (scaled): 0.3145
Iteration 1: Params: {'learning_rate': 0.005, 'batch_size': 32, 'num_epochs': 200, 'hidden1': 256, 'hidden2': 256, 'dropout': 0.5} -> Scaled Val Loss: 0.3145 | Unscaled Val MSE: 3058892544.00, R²: 0.5776
Early stopping at epoch 42, best validation loss (scaled): 0.3089
Iteration 2: Params: {'learning_rate': 0.01, 'batch_size': 16, 'num_epochs': 200, 'hidden1': 512, 'hidden2': 64, 'dropout': 0.5} -> Scaled Val Loss: 0.3089 | Unscaled Val MSE: 2648803328.00, R²: 0.6342
Early stopping at epoch 24, best validation loss (scaled): 0.3241
Iteration 3: Params: {'learning_rate': 0.001, 'batch_size': 32, 'num_epochs': 100, 'hidden1': 128, 'hidden2': 256, 'dropout': 0.2} -> Scaled Val Loss: 0.3241 | Unscaled Val MSE: 2677862400.00, R²: 0.6302
Early stopping at epoch 25, best validation loss (scaled): 0.3208
Iteration 4: Params: {'learning_rate': 0.005, 'batch_size': 64, 'num_epochs': 1000, 'hidden1': 256, 'hidden2': 128, 'dropout':

param_grid = {
    'learning_rate': [0.0005, 0.001, 0.005, 0.01],
    'batch_size': [16, 32, 64],
    'num_epochs': [100, 200, 300],
    'hidden1': [128, 256, 512],
    'hidden2': [64, 128, 256],
    'hidden3': [32, 64],
    'dropout': [0.2, 0.3, 0.5]
}

import random

# Set the number of random combinations you want to try
n_iter = 20  # adjust as needed

best_val_loss = float('inf')
best_params = None
best_model_state = None
best_unscaled_val_mse = None
best_unscaled_val_r2 = None

for i in range(n_iter):
    # Randomly sample a hyperparameter combination
    hparams = {key: random.choice(values) for key, values in param_grid.items()}
    
    val_loss_scaled, temp_model, val_mse_unscaled, val_r2_unscaled = train_and_evaluate_deep(hparams, device=device, patience=20)
    print(f"Iteration {i+1}, Params: {hparams} -> Scaled Val Loss: {val_loss_scaled:.4f} | Unscaled Val MSE: {val_mse_unscaled:.2f}, R²: {val_r2_unscaled:.4f}")
    
    if val_loss_scaled < best_val_loss:
        best_val_loss = val_loss_scaled
        best_params = hparams
        best_model_state = temp_model.state_dict()
        best_unscaled_val_mse = val_mse_unscaled
        best_unscaled_val_r2 = val_r2_unscaled

print("\nBest hyperparameters found for the deep network:")
print(best_params)
print(f"Best validation loss (scaled): {best_val_loss:.4f}")
print(f"Unscaled Validation MSE: {best_unscaled_val_mse:.2f}")
print(f"Unscaled Validation R²: {best_unscaled_val_r2:.4f}")


## 8. Retrain Final Model on Combined Training + Validation Data

We combine the training and validation sets and retrain the model using the best hyperparameters.

In [29]:
X_trainval_combined = np.vstack([X_train, X_val])
y_trainval_combined = np.concatenate([y_train_scaled, y_val_scaled], axis=0)
X_trainval_tensor = torch.tensor(X_trainval_combined, dtype=torch.float32)
y_trainval_tensor = torch.tensor(y_trainval_combined, dtype=torch.float32)
trainval_dataset = TensorDataset(X_trainval_tensor, y_trainval_tensor)

def train_final_deep_model(hparams, device='cpu'):
    input_dim = X_trainval_tensor.shape[1]
    model = HousePriceNNDeep(input_dim, hidden1=hparams['hidden1'],
                             hidden2=hparams['hidden2'],
                             dropout=hparams['dropout']).to(device)
    train_loader = DataLoader(trainval_dataset, batch_size=hparams['batch_size'], shuffle=True)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=hparams['learning_rate'], weight_decay=1e-5)
    for epoch in range(hparams['num_epochs']):
        model.train()
        for batch_X, batch_y in train_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
    return model

final_deep_model = train_final_deep_model(best_params, device=device)

## 9. Evaluate Final Model on Test Split (from train.csv)

We evaluate the final model on the test split from train.csv by inverse-transforming predictions to obtain the original scale.

In [30]:
final_deep_model.eval()
with torch.no_grad():
    test_preds = final_deep_model(X_test_tensor.to(device))
    test_preds_np = test_preds.cpu().numpy()
    y_test_np = y_test_tensor.cpu().numpy()
    test_preds_unscaled = scaler_y.inverse_transform(test_preds_np)
    y_test_unscaled = scaler_y.inverse_transform(y_test_np)
    final_test_mse = mean_squared_error(y_test_unscaled, test_preds_unscaled)
    final_test_r2  = r2_score(y_test_unscaled, test_preds_unscaled)

print("\nFinal Deep Model Performance on Test Split (from train.csv):")
print("Test MSE:", final_test_mse)
print("Test R²:", final_test_r2)


Final Deep Model Performance on Test Split (from train.csv):
Test MSE: 1098889472.0
Test R²: 0.8491486310958862


## 10. Predict SalePrice on New Test Data (test.csv)

We load `test.csv`, one-hot encode and reindex it to match the training features, scale the features, predict using the final model, inverse-transform predictions to the original scale, and create a submission DataFrame.

In [31]:
# Load the test.csv file (which lacks SalePrice but includes an 'Id' column)
test_df = pd.read_csv('test.csv')
ids = test_df['Id']

# One-hot encode the test data using the same categorical columns as before
test_encoded = pd.get_dummies(test_df, columns=categorical_cols, drop_first=True)
# Reindex to match the training features
test_encoded = test_encoded.reindex(columns=train_columns, fill_value=0)
test_encoded = test_encoded[top_features]  # Select the same top features
test_encoded = test_encoded.astype(float)
X_test_new = scaler_X.transform(test_encoded)

X_test_tensor_new = torch.tensor(X_test_new, dtype=torch.float32)
final_deep_model.eval()
with torch.no_grad():
    test_predictions = final_deep_model(X_test_tensor_new.to(device))
    test_predictions_np = test_predictions.cpu().numpy()

# Inverse transform predictions to get SalePrice in the original scale.
test_predictions_unscaled = scaler_y.inverse_transform(test_predictions_np)

submission_df = pd.DataFrame({
    'Id': ids,
    'SalePrice': test_predictions_unscaled.flatten()
})

print("\nSubmission Preview:")
print(submission_df.head())

# Optionally, export to CSV:
submission_df.to_csv('submissionV3.csv', index=False)


Submission Preview:
     Id      SalePrice
0  1001   69984.562500
1  1002   91345.554688
2  1003  255163.671875
3  1004  170563.968750
4  1005  207829.437500


