# House Price Predictor

In [136]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from torch.utils.data import TensorDataset, DataLoader

In [137]:
# -----------------------------
# 1. Load the dataset
# -----------------------------
# Replace 'house_prices.csv' with the path to your dataset.
data = pd.read_csv('train.csv')
data


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,50,RL,51.0,4712,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,,0,8,2006,WD,Abnorml,121600
996,997,20,RL,,10659,Pave,,IR1,Lvl,AllPub,...,0,,,,0,1,2006,COD,Normal,136500
997,998,20,RL,,11717,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2009,WD,Normal,185000
998,999,30,RM,60.0,9786,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2006,WD,Normal,91000


In [138]:
# -----------------------------
# 2. Data Cleaning
# -----------------------------
# Select only numeric columns that have no missing data.
numeric_cols = data.select_dtypes(include=[np.number]).columns

#keep only columns with no missing values
clean_numeric_cols = [col for col in numeric_cols if data[col].isna().sum() == 0]

#create a cleared dataset with only numeric columns
data_clean = data[clean_numeric_cols]


# Ensure that the target column 'price' is present.
if 'SalePrice' not in data_clean.columns:
    raise ValueError("The target column 'price' is not present in the complete numeric data.")
print(data_clean.columns)

Index(['Id', 'MSSubClass', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
       'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
       'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
       'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars',
       'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
       'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SalePrice'],
      dtype='object')


In [139]:
# -----------------------------
# 3. Feature Selection
# -----------------------------
# Compute the correlation matrix using only the cleaned numeric data.
corr_matrix = data_clean.corr()

# Compute absolute correlations of features with the target and drop the target itself.
target_corr = corr_matrix['SalePrice'].drop('SalePrice').abs().sort_values(ascending=False)

# Select only the top 4 features with the highest correlation with 'SalesPrice'
top4_features = target_corr.head(4).index
print("Selected top 4 features:", list(top4_features))

# Define input features (X) and target variable (y).
X = data_clean[top4_features].values
y = data_clean['SalePrice'].values.reshape(-1, 1)

Selected top 4 features: ['OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea']


In [140]:
# -----------------------------
# 4. Data Preprocessing
# -----------------------------
# Split the data into training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features to improve training stability.
scaler_X = StandardScaler()
X_train = scaler_X.fit_transform(X_train)
X_test = scaler_X.transform(X_test)

# ✅ Standardize SalePrice (y values)
scaler_y = StandardScaler()
y_train = scaler_y.fit_transform(y_train.reshape(-1, 1))
y_test = scaler_y.transform(y_test.reshape(-1, 1))


print(X_train.shape, X_test.shape)

(800, 4) (200, 4)


In [141]:
# Convert the numpy arrays to PyTorch tensors.
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

# Create a TensorDataset and DataLoader for batch processing.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

print(X_train_tensor.shape, y_train_tensor.shape, X_test_tensor.shape, y_test_tensor.shape)

torch.Size([800, 4]) torch.Size([800, 1]) torch.Size([200, 4]) torch.Size([200, 1])


In [142]:
# -----------------------------
# 5. Define the Neural Network Model
# -----------------------------
class HousePriceModel(nn.Module):
    def __init__(self, input_dim):
        super(HousePriceModel, self).__init__()
        #Deeper architecture for better learning
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)  # Output layer for regression
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)  # 20% dropout
    
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        x = self.dropout(x)
        return x

# Use GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = HousePriceModel(input_dim=X_train.shape[1]).to(device)


In [143]:
# -----------------------------
# 6. Set Up Loss Function and Optimizer
# -----------------------------
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# -----------------------------
# 7. Train the Model
# -----------------------------
num_epochs = 1000
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    
    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item() * batch_X.size(0)
    
    epoch_loss = running_loss / len(train_dataset)
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}")

Epoch [10/1000], Loss: 0.6134
Epoch [20/1000], Loss: 0.4106
Epoch [30/1000], Loss: 0.3894
Epoch [40/1000], Loss: 0.3609
Epoch [50/1000], Loss: 0.3652
Epoch [60/1000], Loss: 0.3626
Epoch [70/1000], Loss: 0.3615
Epoch [80/1000], Loss: 0.3448
Epoch [90/1000], Loss: 0.3040
Epoch [100/1000], Loss: 0.3598
Epoch [110/1000], Loss: 0.3837
Epoch [120/1000], Loss: 0.3508
Epoch [130/1000], Loss: 0.3433
Epoch [140/1000], Loss: 0.3998
Epoch [150/1000], Loss: 0.4200
Epoch [160/1000], Loss: 0.3451
Epoch [170/1000], Loss: 0.3504
Epoch [180/1000], Loss: 0.2581
Epoch [190/1000], Loss: 0.3228
Epoch [200/1000], Loss: 0.3801
Epoch [210/1000], Loss: 0.3256
Epoch [220/1000], Loss: 0.3629
Epoch [230/1000], Loss: 0.3382
Epoch [240/1000], Loss: 0.3145
Epoch [250/1000], Loss: 0.3420
Epoch [260/1000], Loss: 0.3250
Epoch [270/1000], Loss: 0.3312
Epoch [280/1000], Loss: 0.3229
Epoch [290/1000], Loss: 0.3254
Epoch [300/1000], Loss: 0.3713
Epoch [310/1000], Loss: 0.3534
Epoch [320/1000], Loss: 0.2903
Epoch [330/1000],

In [145]:
# 8. Evaluate the Model
# -----------------------------
model.eval()
with torch.no_grad():
    predictions = model(X_test_tensor.to(device))
    test_loss = criterion(predictions, y_test_tensor.to(device)).item()
    print("Test Mean Squared Error:", test_loss)

# Optionally, to evaluate using scikit-learn's MSE:
predictions_np = predictions.cpu().numpy()
mse = mean_squared_error(y_test, predictions_np)
print("Test MSE (scikit-learn):", mse)

Test Mean Squared Error: 0.15165792405605316
Test MSE (scikit-learn): 0.15165792121422278


In [146]:
predictions_original = scaler_y.inverse_transform(predictions_np.reshape(-1, 1))
y_test_original = scaler_y.inverse_transform(y_test.reshape(-1, 1))

mse_original = mean_squared_error(y_test_original, predictions_original)
rmse_original = mse_original**0.5
print(f"Test RMSE in Dollars: ${rmse_original:,.2f}")

Test RMSE in Dollars: $32,532.60


In [147]:
# -----------------------------
# 9. Load Test Data and Predict
# -----------------------------

# Load test.csv
test_data = pd.read_csv('test.csv')

# ✅ Ensure `top4_features` is defined
if 'top4_features' not in locals():
    raise ValueError("❌ `top4_features` is not defined. Make sure Step 3 ran correctly!")

# ✅ Ensure all training features exist in test data
for col in top4_features:
    if col not in test_data.columns:
        print(f"⚠ Warning: {col} is missing in test.csv! Filling with 0.")
        test_data[col] = 0  # Fill missing features with 0

# ✅ Select only the top 4 features used during training
X_test = test_data[top4_features].values  # Ensure correct shape

# Use the same scaler fitted on the training data
X_test = scaler_X.transform(X_test)  # This should now work correctly

# Convert to PyTorch tensor
X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)

# -----------------------------
# Make Predictions
# -----------------------------
model.eval()
with torch.no_grad():
    test_predictions = model(X_test_tensor)

# Convert predictions back to original scale (if SalePrice was standardized)
test_predictions_np = test_predictions.cpu().numpy().flatten()
test_predictions_original = scaler_y.inverse_transform(test_predictions_np.reshape(-1, 1)).flatten()


In [149]:
# -----------------------------
# 10. Save Predictions to CSV
# -----------------------------
# The submission file should have columns: ID, SALEPRICE
submission = pd.DataFrame({
    "Id": test_data["Id"],  # 'Id' should match the column in your test.csv
    "SalePrice": test_predictions_original
})

submission.to_csv('predictions.csv', index=False)
print("Submission file saved as predictions.csv")

Submission file saved as predictions.csv
