# House Price Prediction Model with Cross-Validation

This notebook demonstrates a modified pipeline for a house price prediction model. Instead of using a separate validation set, we split the data from `train.csv` into a training set and a test set. We then perform hyperparameter tuning via k-fold cross-validation (5-fold) on the training data. Finally, we retrain on the entire training set, evaluate on the held-out test set from `train.csv`, and predict on unseen data from `test.csv`.

In [21]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import random

# -------------------------------
# 1. Load and Prepare the Data
# -------------------------------

# Load the training data
train_df = pd.read_csv('train.csv')

# Define the target variable
target = 'SalePrice'

# One-hot encode categorical columns (original columns dropped automatically)
categorical_cols = train_df.select_dtypes(include=['object', 'category']).columns
data = pd.get_dummies(train_df, columns=categorical_cols, dtype=int, drop_first=True)

# Keep only columns with no missing values
clean_numeric_cols = [col for col in data.columns if data[col].isna().sum() == 0]
data_clean = data[clean_numeric_cols]
print("Clean numeric columns:")
print(data_clean.columns)

# Ensure the target column 'SalePrice' is present.
if target not in data_clean.columns:
    raise ValueError("The target column 'SalePrice' is not present in the complete numeric data.")

# Compute the correlation matrix on the clean numeric data.
corr_matrix = data_clean.corr()

# Compute absolute correlations of features with the target (drop the target itself)
target_corr = corr_matrix[target].drop(target).abs().sort_values(ascending=False)

# Select the top features (adjust the number as needed; here we take top 15)
top_features = target_corr.head(15).index
print("Selected top features:", list(top_features))

# Save the feature column names for reindexing later.
train_columns = data_clean.drop(columns=[target]).columns

# Define input features (X) and target variable (y)
X = data_clean[top_features].values
y = data_clean[target].values.reshape(-1, 1)  # y is a numpy array now

Clean numeric columns:
Index(['Id', 'MSSubClass', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
       ...
       'SaleType_ConLI', 'SaleType_ConLw', 'SaleType_New', 'SaleType_Oth',
       'SaleType_WD', 'SaleCondition_AdjLand', 'SaleCondition_Alloca',
       'SaleCondition_Family', 'SaleCondition_Normal',
       'SaleCondition_Partial'],
      dtype='object', length=229)
Selected top features: ['OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF', '1stFlrSF', 'ExterQual_TA', 'TotRmsAbvGrd', 'FullBath', 'KitchenQual_TA', 'YearBuilt', 'YearRemodAdd', 'Foundation_PConc', 'Fireplaces', 'BsmtQual_TA']


## 2. Split Data: Only Train and Test (from train.csv)

We split the data into an 80% training set and a 20% test set.

In [22]:
# Split into training and test sets (e.g., 80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


## 3. Standardize Features and Scale the Target

In [23]:
# Create scaler for features
scaler_X = StandardScaler()
X_train = scaler_X.fit_transform(X_train)
X_test  = scaler_X.transform(X_test)

# Create scaler for the target variable
scaler_y = StandardScaler()
y_train_scaled = scaler_y.fit_transform(y_train)
y_test_scaled  = scaler_y.transform(y_test)


## 4. Convert Data to PyTorch Tensors

In [24]:
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_scaled, dtype=torch.float32)
X_test_tensor  = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor  = torch.tensor(y_test_scaled, dtype=torch.float32)

# Create a dataset for training (for final training later)
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)


## 5. Define the Deep Neural Network Model

In [25]:
class HousePriceNNDeep(nn.Module):
    def __init__(self, input_dim, hidden1=128, hidden2=64, hidden3=32, dropout=0.5):
        super(HousePriceNNDeep, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, hidden3)
        self.fc4 = nn.Linear(hidden3, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.relu(self.fc3(x))
        x = self.dropout(x)
        x = self.fc4(x)
        return x


## 6. Define Cross-Validation Training Function

This function performs k-fold cross-validation on the training set using the provided hyperparameters.

In [26]:
def train_and_evaluate_deep_cv(hparams, cv=5, device='cpu', patience=20):
    """
    Performs k-fold cross-validation on the training set with given hyperparameters.
    Returns the average scaled validation loss, unscaled MSE, and R² across folds.
    """
    kf = KFold(n_splits=cv, shuffle=True, random_state=42)
    val_losses = []
    val_mses = []
    val_r2s = []
    
    for train_index, val_index in kf.split(X_train_tensor):
        # Create fold-specific datasets
        X_train_fold = X_train_tensor[train_index]
        y_train_fold = y_train_tensor[train_index]
        X_val_fold = X_train_tensor[val_index]
        y_val_fold = y_train_tensor[val_index]
        
        train_dataset_fold = TensorDataset(X_train_fold, y_train_fold)
        train_loader_fold = DataLoader(train_dataset_fold, batch_size=hparams['batch_size'], shuffle=True)
        
        # Initialize model for this fold
        model = HousePriceNNDeep(
            input_dim=X_train_tensor.shape[1],
            hidden1=hparams['hidden1'],
            hidden2=hparams['hidden2'],
            hidden3=hparams['hidden3'],
            dropout=hparams['dropout']
        ).to(device)
        
        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=hparams['learning_rate'], weight_decay=1e-5)
        
        best_val_loss_fold = float('inf')
        epochs_without_improve = 0
        best_model_state = None
        
        for epoch in range(hparams['num_epochs']):
            model.train()
            running_loss = 0.0
            for batch_X, batch_y in train_loader_fold:
                batch_X, batch_y = batch_X.to(device), batch_y.to(device)
                optimizer.zero_grad()
                outputs = model(batch_X)
                loss = criterion(outputs, batch_y)
                loss.backward()
                optimizer.step()
                running_loss += loss.item() * batch_X.size(0)
            
            # Evaluate on fold's validation set
            model.eval()
            with torch.no_grad():
                val_preds = model(X_val_fold.to(device))
                val_loss = criterion(val_preds, y_val_fold.to(device)).item()
            
            if val_loss < best_val_loss_fold:
                best_val_loss_fold = val_loss
                epochs_without_improve = 0
                best_model_state = model.state_dict()
            else:
                epochs_without_improve += 1
            if epochs_without_improve >= patience:
                break
        
        # Load best model for fold
        model.load_state_dict(best_model_state)
        model.eval()
        with torch.no_grad():
            val_preds = model(X_val_fold.to(device))
        val_preds_np = val_preds.cpu().numpy()
        y_val_np = y_val_fold.cpu().numpy()
        
        # Inverse-transform predictions and true values
        val_preds_unscaled = scaler_y.inverse_transform(val_preds_np)
        y_val_unscaled = scaler_y.inverse_transform(y_val_np)
        mse_fold = mean_squared_error(y_val_unscaled, val_preds_unscaled)
        r2_fold = r2_score(y_val_unscaled, val_preds_unscaled)
        
        val_losses.append(best_val_loss_fold)
        val_mses.append(mse_fold)
        val_r2s.append(r2_fold)
    
    avg_val_loss = np.mean(val_losses)
    avg_val_mse = np.mean(val_mses)
    avg_val_r2 = np.mean(val_r2s)
    return avg_val_loss, avg_val_mse, avg_val_r2


## 7. Hyperparameter Tuning via Random Search with Cross-Validation

We randomly sample hyperparameter configurations and evaluate each using 5-fold cross-validation.

In [27]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

param_grid = {
    'learning_rate': [0.001, 0.01],
    'batch_size': [16, 32],
    'num_epochs': [100, 200, 500, 1000],
    'hidden1': [128, 256],
    'hidden2': [64, 128],
    'hidden3': [32, 64],
    'dropout': [0.2, 0.5]
}

best_val_loss = float('inf')
best_params = None
best_unscaled_val_mse = None
best_unscaled_val_r2 = None

n_iter = 50  # Number of random hyperparameter configurations to try

for i in range(n_iter):
    # Randomly sample one value from each hyperparameter list
    hparams = {
        'learning_rate': random.choice(param_grid['learning_rate']),
        'batch_size': random.choice(param_grid['batch_size']),
        'num_epochs': random.choice(param_grid['num_epochs']),
        'hidden1': random.choice(param_grid['hidden1']),
        'hidden2': random.choice(param_grid['hidden2']),
        'hidden3': random.choice(param_grid['hidden3']),
        'dropout': random.choice(param_grid['dropout'])
    }
    
    avg_val_loss, avg_val_mse, avg_val_r2 = train_and_evaluate_deep_cv(hparams, cv=5, device=device, patience=20)
    
    print(f"Iteration {i+1}/{n_iter} - Params: {hparams} -> Avg Scaled Val Loss: {avg_val_loss:.4f} | Avg Unscaled Val MSE: {avg_val_mse:.2f}, R²: {avg_val_r2:.4f}")
    
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        best_params = hparams
        best_unscaled_val_mse = avg_val_mse
        best_unscaled_val_r2 = avg_val_r2

print("\nBest hyperparameters found via cross-validation:")
print(best_params)
print(f"Best Avg Validation Loss (scaled): {best_val_loss:.4f}")
print(f"Avg Unscaled Validation MSE: {best_unscaled_val_mse:.2f}")
print(f"Avg Unscaled Validation R²: {best_unscaled_val_r2:.4f}")


Iteration 1/50 - Params: {'learning_rate': 0.001, 'batch_size': 32, 'num_epochs': 1000, 'hidden1': 256, 'hidden2': 64, 'hidden3': 32, 'dropout': 0.2} -> Avg Scaled Val Loss: 0.1436 | Avg Unscaled Val MSE: 1330744716.80, R²: 0.8069
Iteration 2/50 - Params: {'learning_rate': 0.01, 'batch_size': 16, 'num_epochs': 100, 'hidden1': 128, 'hidden2': 128, 'hidden3': 32, 'dropout': 0.2} -> Avg Scaled Val Loss: 0.1616 | Avg Unscaled Val MSE: 1844435635.20, R²: 0.7349
Iteration 3/50 - Params: {'learning_rate': 0.01, 'batch_size': 16, 'num_epochs': 500, 'hidden1': 256, 'hidden2': 64, 'hidden3': 32, 'dropout': 0.2} -> Avg Scaled Val Loss: 0.1532 | Avg Unscaled Val MSE: 2216029388.80, R²: 0.6754
Iteration 4/50 - Params: {'learning_rate': 0.01, 'batch_size': 32, 'num_epochs': 100, 'hidden1': 256, 'hidden2': 64, 'hidden3': 64, 'dropout': 0.5} -> Avg Scaled Val Loss: 0.1707 | Avg Unscaled Val MSE: 2766583040.00, R²: 0.5992
Iteration 5/50 - Params: {'learning_rate': 0.01, 'batch_size': 16, 'num_epochs': 

## 8. Retrain Final Model on the Entire Training Set (from train.csv)

In [28]:
def train_final_deep_model(hparams, device='cpu'):
    input_dim = X_train_tensor.shape[1]
    model = HousePriceNNDeep(
        input_dim,
        hidden1=hparams['hidden1'],
        hidden2=hparams['hidden2'],
        hidden3=hparams['hidden3'],
        dropout=hparams['dropout']
    ).to(device)
    train_loader = DataLoader(train_dataset, batch_size=hparams['batch_size'], shuffle=True)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=hparams['learning_rate'], weight_decay=1e-5)
    
    for epoch in range(hparams['num_epochs']):
        model.train()
        for batch_X, batch_y in train_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
    return model

final_deep_model = train_final_deep_model(best_params, device=device)


## 9. Evaluate Final Model on Test Split (from train.csv)

We now evaluate the final model on the held-out test set (inverse-transforming predictions to the original scale).

In [29]:
final_deep_model.eval()
with torch.no_grad():
    test_preds = final_deep_model(X_test_tensor.to(device))
    test_preds_np = test_preds.cpu().numpy()
    y_test_np = y_test_tensor.cpu().numpy()
    test_preds_unscaled = scaler_y.inverse_transform(test_preds_np)
    y_test_unscaled = scaler_y.inverse_transform(y_test_np)
    final_test_mse = mean_squared_error(y_test_unscaled, test_preds_unscaled)
    final_test_r2  = r2_score(y_test_unscaled, test_preds_unscaled)

print("\nFinal Deep Model Performance on Test Split (from train.csv):")
print("Test MSE:", final_test_mse)
print("Test R²:", final_test_r2)



Final Deep Model Performance on Test Split (from train.csv):
Test MSE: 794998720.0
Test R²: 0.8096362352371216


## 10. Predict SalePrice on New Unseen Test Data (test.csv) and Create Submission DataFrame

In [30]:
# Load the test.csv file (which lacks SalePrice but includes an 'Id' column)
test_df = pd.read_csv('test.csv')
ids = test_df['Id']

# One-hot encode the test data using the same categorical columns as before
test_encoded = pd.get_dummies(test_df, columns=categorical_cols, drop_first=True)

# Reindex to match the training features
test_encoded = test_encoded.reindex(columns=train_columns, fill_value=0)
test_encoded = test_encoded[top_features]  # Select the same top features
test_encoded = test_encoded.astype(float)
X_test_new = scaler_X.transform(test_encoded)

X_test_tensor_new = torch.tensor(X_test_new, dtype=torch.float32)
final_deep_model.eval()
with torch.no_grad():
    test_predictions = final_deep_model(X_test_tensor_new.to(device))
    test_predictions_np = test_predictions.cpu().numpy()

# Inverse transform predictions to get SalePrice in the original scale.
test_predictions_unscaled = scaler_y.inverse_transform(test_predictions_np)

submission_df = pd.DataFrame({
    'ID': ids.astype(int),
    'SALEPRICE': test_predictions_unscaled.flatten().astype(float)
})

print("\nSubmission Preview:")
print(submission_df.head())

# Optionally, export to CSV:
submission_df.to_csv('predictions3.csv', index=False)



Submission Preview:
     ID      SALEPRICE
0  1001   85357.546875
1  1002   82864.820312
2  1003  269443.562500
3  1004  148358.968750
4  1005  207777.531250


