# House Price Predictor

In [20]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from torch.utils.data import TensorDataset, DataLoader

In [21]:
# -----------------------------
# 1. Load the dataset
# -----------------------------
# Replace 'house_prices.csv' with the path to your dataset.
data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

print("Training data shape:", data.shape)
print("Test data shape:", test_data.shape)
print("\nTraining data columns:")
print(data.columns)
print("\nTest data columns:")
print(test_data.columns)
print("\nTraining data info:")
data.info()
print("\nTest data info:")
test_data.info()
print("\nTraining data description:")
print(data.describe())

Training data shape: (1000, 81)
Test data shape: (460, 80)

Training data columns:
Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'Garage

In [22]:
# -----------------------------
# 2. Data Cleaning
# -----------------------------
# Handle missing values
def handle_missing(df):
    numeric_columns = df.select_dtypes(include=[np.number]).columns
    categorical_columns = df.select_dtypes(exclude=[np.number]).columns
    
    # put the median in place of missing values for numeric columns
    for col in numeric_columns:
        df[col].fillna(df[col].median(), inplace=True)
    
    # put the mode in place of missing values for categorical columns
    for col in categorical_columns:
        df[col].fillna(df[col].mode()[0], inplace=True)
    
    return df

# encode categorical columns
def encode_categorical(df):
    categorical_columns = df.select_dtypes(exclude=[np.number]).columns
    return pd.get_dummies(df, columns=categorical_columns)

# preprocessing
data_clean = handle_missing(data)
test_data_clean = handle_missing(test_data)

data_encoded = encode_categorical(data_clean)
test_data_encoded = encode_categorical(test_data_clean)

# align the columns of the training and test datasets
common_columns = list(set(data_encoded.columns) & set(test_data_encoded.columns))
data_final = data_encoded[common_columns + ['SalePrice']]
test_data_final = test_data_encoded[common_columns]

# Check if the target column is present in the training data
if 'SalePrice' not in data_final.columns:
    raise ValueError("The target column 'SalePrice' is not present in the training data.")

print("Cleaned training data shape:", data_final.shape)
print("Cleaned test data shape:", test_data_final.shape)


Cleaned training data shape: (1000, 252)
Cleaned test data shape: (460, 251)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values

In [23]:
# -----------------------------
# 3. Feature Selection
# -----------------------------
from sklearn.feature_selection import mutual_info_regression

def select_features(X, y, k=20):
    mi_scores = mutual_info_regression(X, y)
    mi_scores = pd.Series(mi_scores, index=X.columns)
    return mi_scores.nlargest(k).index.tolist()

# Define x,y and k
X = data_final.drop('SalePrice', axis=1)
y = data_final['SalePrice']
k = 20

selected_features = select_features(X, y)

print(f"Selected top {k} features:", list(selected_features))

# update the training and test datasets with the selected features
X_train = data_final[selected_features]
X_test = test_data_final[selected_features]
y_train = data_final['SalePrice'].values.reshape(-1, 1)

print("Training data shape after feature selection:", X_train.shape)
print("Test data shape after feature selection:", X_test.shape)

# check the correlation matrix of the selected features
selected_corr = X_train.corr()
print("\nCorrelation matrix of selected features:")
print(selected_corr)


Selected top 20 features: ['OverallQual', 'GrLivArea', 'GarageCars', 'YearBuilt', 'GarageArea', 'TotalBsmtSF', '1stFlrSF', 'GarageYrBlt', 'MSSubClass', 'ExterQual_TA', 'FullBath', 'YearRemodAdd', 'GarageFinish_Unf', 'KitchenQual_TA', 'TotRmsAbvGrd', 'ExterQual_Gd', 'LotFrontage', '2ndFlrSF', 'Foundation_PConc', 'Fireplaces']
Training data shape after feature selection: (1000, 20)
Test data shape after feature selection: (460, 20)

Correlation matrix of selected features:
                  OverallQual  GrLivArea  GarageCars  YearBuilt  GarageArea  \
OverallQual          1.000000   0.610512    0.638169   0.568318    0.592268   
GrLivArea            0.610512   1.000000    0.509450   0.226229    0.485337   
GarageCars           0.638169   0.509450    1.000000   0.542108    0.884644   
YearBuilt            0.568318   0.226229    0.542108   1.000000    0.493935   
GarageArea           0.592268   0.485337    0.884644   0.493935    1.000000   
TotalBsmtSF          0.539865   0.432726    0.4483

In [24]:
# -----------------------------
# 4. Data Preprocessing
# -----------------------------

# set the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Split the training data
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

y_train = np.log1p(y_train)
y_val = np.log1p(y_val)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensors and move to GPU
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).reshape(-1, 1).to(device)
X_val_tensor = torch.tensor(X_val_scaled, dtype=torch.float32).to(device)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32).reshape(-1, 1).to(device)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32).to(device)

# Create datasets and dataloaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

print("Training data shape:", X_train_tensor.shape)
print("Validation data shape:", X_val_tensor.shape)
print("Test data shape:", X_test_tensor.shape)


Training data shape: torch.Size([800, 20])
Validation data shape: torch.Size([200, 20])
Test data shape: torch.Size([460, 20])


In [25]:
# -----------------------------
# 5. Define the Neural Network Model
# -----------------------------

class HousePriceModel(nn.Module):
    def __init__(self, input_dim):
        super(HousePriceModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.batch_norm1 = nn.BatchNorm1d(128)
        self.batch_norm2 = nn.BatchNorm1d(64)
        self.batch_norm3 = nn.BatchNorm1d(32)
    
    def forward(self, x):
        x = self.relu(self.batch_norm1(self.fc1(x)))
        x = self.dropout(x)
        x = self.relu(self.batch_norm2(self.fc2(x)))
        x = self.dropout(x)
        x = self.relu(self.batch_norm3(self.fc3(x)))
        x = self.dropout(x)
        x = self.fc4(x)
        return x


# Use GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = HousePriceModel(input_dim=X_train_tensor.shape[1]).to(device)

print(model)


HousePriceModel(
  (fc1): Linear(in_features=20, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=32, bias=True)
  (fc4): Linear(in_features=32, out_features=1, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.3, inplace=False)
  (batch_norm1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batch_norm2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batch_norm3): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


In [26]:
# -----------------------------
# 6. Set Up Loss Function and Optimizer
# -----------------------------
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Loss function
criterion = nn.MSELoss()

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=0.0005, weight_decay=1e-4)

# Learning rate scheduler
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10, verbose=True)

# Move model to the specified device (GPU if available)
model = model.to(device)




In [None]:
# -----------------------------
# 7. Train the Model
# -----------------------------
from tqdm import tqdm

num_epochs = 1000
best_val_loss = float('inf')
patience = 50
no_improve = 0

for epoch in tqdm(range(num_epochs)):
    # Training phase
    model.train()
    running_loss = 0.0
    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item() * batch_X.size(0)
    
    train_loss = running_loss / len(train_dataset)
    
    # Validation phase
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch_X, batch_y in val_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            val_loss += loss.item() * batch_X.size(0)
    
    val_loss /= len(val_dataset)
    
    # Learning rate scheduling
    scheduler.step(val_loss)
    
    # Print progress
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")
    
    # Early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'best_model.pth')
        no_improve = 0
    else:
        no_improve += 1
        if no_improve >= patience:
            print(f"Early stopping triggered at epoch {epoch+1}")
            break

print("Training completed.")


  1%|          | 15/2000 [00:00<01:16, 25.89it/s]

Epoch [10/2000], Train Loss: 4.0165, Val Loss: 1.0682


  1%|          | 24/2000 [00:00<01:23, 23.78it/s]

Epoch [20/2000], Train Loss: 4.2198, Val Loss: 1.1842


  2%|▏         | 33/2000 [00:01<01:18, 25.09it/s]

Epoch [30/2000], Train Loss: 4.0369, Val Loss: 1.0141


  2%|▏         | 43/2000 [00:01<01:07, 28.99it/s]

Epoch [40/2000], Train Loss: 3.8257, Val Loss: 1.1322


  3%|▎         | 54/2000 [00:02<01:04, 30.11it/s]

Epoch [50/2000], Train Loss: 3.9022, Val Loss: 1.1121


  3%|▎         | 66/2000 [00:02<01:04, 30.07it/s]

Epoch [60/2000], Train Loss: 3.7690, Val Loss: 1.1411


  4%|▎         | 73/2000 [00:02<01:07, 28.45it/s]

Epoch [70/2000], Train Loss: 3.6028, Val Loss: 1.0896


  4%|▍         | 82/2000 [00:03<01:11, 26.86it/s]

Epoch [80/2000], Train Loss: 3.9937, Val Loss: 1.1602


  5%|▍         | 95/2000 [00:03<01:07, 28.23it/s]

Epoch [90/2000], Train Loss: 3.9640, Val Loss: 1.1263


  5%|▌         | 103/2000 [00:03<01:03, 30.02it/s]

Epoch [100/2000], Train Loss: 4.2267, Val Loss: 1.1990


  6%|▌         | 115/2000 [00:04<00:59, 31.45it/s]

Epoch [110/2000], Train Loss: 3.9370, Val Loss: 1.1057


  6%|▌         | 123/2000 [00:04<01:01, 30.32it/s]

Epoch [120/2000], Train Loss: 3.9359, Val Loss: 1.2285


  6%|▋         | 127/2000 [00:04<01:07, 27.58it/s]

Early stopping triggered at epoch 128
Training completed.





In [31]:
# 8. Evaluate the Model and Calculate MSE
# -----------------------------
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

# Generate predictions
model.eval()
with torch.no_grad():
    predictions = model(X_test_tensor)
    predictions = torch.exp(predictions) - 1  # Reverse the log transformation

# Convert predictions to numpy array
predictions_np = predictions.cpu().numpy()

# Create submission DataFrame
submission = pd.DataFrame({
    'ID': test_data['Id'],
    'SALEPRICE': predictions_np.flatten()
})

# Save predictions to CSV
submission.to_csv('predictions.csv', index=False)

print("Predictions saved to 'predictions.csv'")

# Calculate MSE if answers.csv is available
try:
    answers = pd.read_csv('answers.csv')
    mse = mean_squared_error(answers['SALEPRICE'], submission['SALEPRICE'])
    rmse = np.sqrt(mse)
    print(f"Root Mean Squared Error (RMSE): {rmse}")
except FileNotFoundError:
    print("answers.csv not found. RMSE calculation skipped.")


Predictions saved to 'predictions.csv'
answers.csv not found. RMSE calculation skipped.
