# Lisa's House Price Predictor

In [4]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from torch.utils.data import TensorDataset, DataLoader

In [5]:
# -----------------------------
# 1. Load the dataset
# -----------------------------
# The training data is read from 'train.csv' (path relative to the working directory);
# change the path below if your dataset lives elsewhere.
data = pd.read_csv('train.csv')
# Uncomment to preview the raw frame:
# data

# ADDED
# Preprocess the training data
def preprocess_data(df, is_train=True):
    """Fill missing values and one-hot encode the object-typed columns of ``df``.

    Missing values are replaced with 0 and every ``object`` column is expanded
    into dummy columns.

    Args:
        df: Input DataFrame. It is not modified; a new frame is returned.
        is_train: When True, the first level of each categorical column is
            dropped (``drop_first=True``) and the resulting column layout is
            stored on ``preprocess_data.columns``. When False, the frame is
            aligned to that stored layout.

    Returns:
        The encoded DataFrame. For ``is_train=False`` it has exactly the stored
        training columns, in the same order; training columns absent from
        ``df`` are filled with 0 and columns unknown to training are dropped.

    Raises:
        AttributeError: If called with ``is_train=False`` before any call with
            ``is_train=True`` has stored the training columns.
    """
    df = df.fillna(0)

    # One-hot encode categorical features
    categorical_features = df.select_dtypes(include=['object']).columns

    if is_train:
        df = pd.get_dummies(df, columns=categorical_features, drop_first=True)
        # Save the columns so later (test) frames can be aligned to them.
        preprocess_data.columns = df.columns
        return df

    train_columns = getattr(preprocess_data, 'columns', None)
    if train_columns is None:
        raise AttributeError(
            "preprocess_data must be called with is_train=True before it can "
            "align a frame with is_train=False."
        )

    # Encode every level here (no drop_first): the level dropped as the
    # baseline must be the one chosen on the training data, not whichever
    # level happens to sort first in this frame. Aligning to the training
    # columns below removes the training baseline and any unseen levels.
    df = pd.get_dummies(df, columns=categorical_features)

    # One reindex adds the missing training columns (filled with 0), drops
    # unknown ones and restores the training column order.
    return df.reindex(columns=train_columns, fill_value=0)

# Encode the training frame. With the default is_train=True this call also
# stores the resulting column layout on preprocess_data.columns.
data_df_processed = preprocess_data(data)
data_df_processed

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,False,False,False,False,True,False,False,False,True,False
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,False,False,False,False,True,False,False,False,True,False
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,False,False,False,False,True,False,False,False,True,False
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,False,False,False,False,True,False,False,False,False,False
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,False,False,False,False,True,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,50,51.0,4712,4,7,1946,1950,0.0,384,...,False,False,False,False,True,False,False,False,False,False
996,997,20,0.0,10659,5,6,1961,1961,0.0,915,...,False,False,False,False,False,False,False,False,True,False
997,998,20,0.0,11717,6,6,1970,1970,571.0,0,...,False,False,False,False,True,False,False,False,True,False
998,999,30,60.0,9786,3,4,1922,1950,0.0,0,...,False,False,False,False,True,False,False,False,True,False


In [6]:
# -----------------------------
# 2. Data Cleaning
# -----------------------------
# Keep only the numeric columns that contain no missing values.
numeric_cols = data.select_dtypes(include=[np.number]).columns
clean_numeric_cols = [col for col in numeric_cols if not data[col].isna().any()]
data_clean = data[clean_numeric_cols]

# The target column 'SalePrice' must have survived the filtering above;
# fail early with a message that names the actual column.
if 'SalePrice' not in data_clean.columns:
    raise ValueError("The target column 'SalePrice' is not present in the complete numeric data.")

data_clean

Unnamed: 0,Id,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,1,60,8450,7,5,2003,2003,706,0,150,...,0,61,0,0,0,0,0,2,2008,208500
1,2,20,9600,6,8,1976,1976,978,0,284,...,298,0,0,0,0,0,0,5,2007,181500
2,3,60,11250,7,5,2001,2002,486,0,434,...,0,42,0,0,0,0,0,9,2008,223500
3,4,70,9550,7,5,1915,1970,216,0,540,...,0,35,272,0,0,0,0,2,2006,140000
4,5,60,14260,8,5,2000,2000,655,0,490,...,192,84,0,0,0,0,0,12,2008,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,50,4712,4,7,1946,1950,384,0,363,...,0,57,0,0,63,0,0,8,2006,121600
996,997,20,10659,5,6,1961,1961,915,0,135,...,0,319,0,0,0,0,0,1,2006,136500
997,998,20,11717,6,6,1970,1970,0,0,1442,...,371,0,0,0,0,0,0,2,2009,185000
998,999,30,9786,3,4,1922,1950,0,0,1007,...,0,100,48,0,0,0,0,5,2006,91000


In [7]:
# VIEWING THE CLEAN DATASET

# Re-select the complete numeric columns (clean_numeric_cols) from the original frame.
data_clean = data[clean_numeric_cols]

# Export data_clean to a CSV file (without the index column) so it can be inspected externally.
data_clean.to_csv('data_clean.csv', index=False)

In [8]:
# -----------------------------
# 1-2. Confirming Data Cleaning
# -----------------------------
# Print the number of columns in the original dataset.
print("Number of original columns:", data.shape[1])

# Print the number of columns left after keeping only complete numeric columns.
print("Number of columns after clean:", data_clean.shape[1])

Number of original columns: 81
Number of columns after clean: 35


In [9]:
# -----------------------------
# 3. Feature Selection
# -----------------------------
# Compute the correlation matrix using only the cleaned numeric data.
# NOTE(review): this uses every row of data_clean, i.e. it runs before the
# train/test split performed in the preprocessing step, so held-out rows
# also influence which features are selected.
corr_matrix = data_clean.corr()

# Uncomment to see what the correlation matrix looks like:
#print(corr_matrix)

# Absolute correlation of every feature with the target, strongest first;
# the target's correlation with itself is dropped.
target_corr = corr_matrix['SalePrice'].drop('SalePrice').abs().sort_values(ascending=False)

# Uncomment to view the full list of correlations:
#print(target_corr)

# Select only the top 4 features with the highest absolute correlation with 'SalePrice'.
top4_features = target_corr.head(4).index
print("Selected top 4 features:", list(top4_features))

# Define input features (X) and target variable (y).
# y is reshaped to a single column, i.e. shape (n_samples, 1).
X = data_clean[top4_features].values
y = data_clean['SalePrice'].values.reshape(-1, 1)

# Confirm the sizes of X (the selected features) and y (target variable = SalePrice).
print("X shape:", X.shape)
print("y shape:", y.shape)

# -----------------------------
# Optional: uncomment to print the cleaned frame.
#print(data_clean)

Selected top 4 features: ['OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea']
X shape: (1000, 4)
y shape: (1000, 1)


In [10]:
# -----------------------------
# 4. Data Preprocessing
# -----------------------------
# Split the data into training (80%) and testing (20%) sets; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features to improve training stability.
# The scaler is fitted on the training rows only and then applied to the test rows.
# The targets (y_train / y_test) are not scaled here.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Confirm the sizes of the training and testing sets.
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


X_train shape: (800, 4)
X_test shape: (200, 4)
y_train shape: (800, 1)
y_test shape: (200, 1)


In [11]:
# Convert the numpy arrays to float32 PyTorch tensors.
# The targets are viewed as a single column, (n_samples, 1).
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)

# Purpose of the code below:
# DataLoader is used to load the data in batches. It provides an iterable over the given dataset.
# train_loader is created with a batch size of 32 and shuffling enabled, meaning the data will be randomly shuffled at each epoch.

# Create a TensorDataset and DataLoader for batch processing.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [12]:
# -----------------------------
# 5. Define the Neural Network Model
# -----------------------------

#HousePriceModel is a subclass of nn.Module, which is the base class for all neural network modules in PyTorch.

class HousePriceModel(nn.Module):
    """Fully connected regressor: input_dim -> 64 -> 32 -> 1, with ReLU after the two hidden layers."""

    def __init__(self, input_dim):
        """Build the three linear layers and the shared ReLU activation.

        Args:
            input_dim: Number of input features per sample.
        """
        super().__init__()

        # Hidden layers: input_dim -> 64 -> 32 features.
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        # Output layer for regression: 32 features -> 1 value (the predicted house price).
        self.fc3 = nn.Linear(32, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Run the forward pass.

        Args:
            x: Input tensor whose last dimension equals ``input_dim``.

        Returns:
            Tensor with a last dimension of 1; no activation is applied to the
            output layer.
        """
        hidden = self.relu(self.fc1(x))
        hidden = self.relu(self.fc2(hidden))
        return self.fc3(hidden)

# Use GPU if available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# The input width is taken from the number of columns of X_train; the model is moved to the chosen device.
model = HousePriceModel(input_dim=X_train.shape[1]).to(device)
print(model)

HousePriceModel(
  (fc1): Linear(in_features=4, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=32, bias=True)
  (fc3): Linear(in_features=32, out_features=1, bias=True)
  (relu): ReLU()
)


In [13]:
# -----------------------------
# 6. Set Up Loss Function and Optimizer
# -----------------------------

# nn.MSELoss() is a mean squared error (MSE) loss function.
# MSE loss is commonly used for regression tasks. It calculates the average squared difference between the 
# predicted values and the actual target values. The goal during training is to minimize this loss.
criterion = nn.MSELoss()

# optim.Adam is an implementation of the Adam optimization algorithm.
# Adam is an adaptive learning rate optimization algorithm that is commonly used for training deep learning models.
# lr=0.001 sets the learning rate for the optimizer. 
# The learning rate controls how much the model's parameters are adjusted with respect to the loss 
# gradient during each iteration of training.
optimizer = optim.Adam(model.parameters(), lr=0.001)

# The last expression of the cell is displayed; swap in `criterion` to view the loss function instead.
# criterion
optimizer

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.001
    maximize: False
    weight_decay: 0
)

In [14]:
# -----------------------------
# 7. Train the Model
# -----------------------------

# num_epochs is set to 1000, meaning the training loop makes 1000 full passes (epochs) over the training data.
num_epochs = 1000


# This is the training loop that iterates over the dataset for the specified number of epochs.
for epoch in range(num_epochs):

    # The model is put into training mode at the start of every epoch.
    model.train()
    
    # running_loss accumulates the summed (per-sample) loss over the epoch.
    running_loss = 0.0

    # This loop iterates over the batches of data provided by train_loader.
    # batch_X and batch_y are moved to the specified device (GPU or CPU).
    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        
        # Clear the gradients left over from the previous step before the forward pass.
        optimizer.zero_grad()
        outputs = model(batch_X)

        # The loss is calculated using the criterion (MSELoss) between the model's predictions outputs and the actual targets batch_y.
        loss = criterion(outputs, batch_y)
        # Backpropagate and update the model parameters.
        loss.backward()
        optimizer.step()
        
        # loss.item() is the batch mean; multiplying by the batch size gives the batch sum,
        # so batches of different sizes are weighted correctly.
        running_loss += loss.item() * batch_X.size(0)

    # Average loss per training sample for this epoch.
    epoch_loss = running_loss / len(train_dataset)

    # The average loss is computed at the end of every epoch,
    # but it is only printed every 10 epochs to monitor the training progress.

    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}")

Epoch [10/1000], Loss: 40970934681.6000
Epoch [20/1000], Loss: 40127305154.5600
Epoch [30/1000], Loss: 37826827632.6400
Epoch [40/1000], Loss: 33839503605.7600
Epoch [50/1000], Loss: 28412521103.3600
Epoch [60/1000], Loss: 22239202549.7600
Epoch [70/1000], Loss: 16287587655.6800
Epoch [80/1000], Loss: 11455863603.2000
Epoch [90/1000], Loss: 8235348357.1200
Epoch [100/1000], Loss: 6470477086.7200
Epoch [110/1000], Loss: 5601267927.0400
Epoch [120/1000], Loss: 5104927221.7600
Epoch [130/1000], Loss: 4723616552.9600
Epoch [140/1000], Loss: 4373977047.0400
Epoch [150/1000], Loss: 4038087946.2400
Epoch [160/1000], Loss: 3716831544.3200
Epoch [170/1000], Loss: 3411028848.6400
Epoch [180/1000], Loss: 3123525877.7600
Epoch [190/1000], Loss: 2858182937.6000
Epoch [200/1000], Loss: 2616967321.6000
Epoch [210/1000], Loss: 2403102141.4400
Epoch [220/1000], Loss: 2220025571.8400
Epoch [230/1000], Loss: 2067811061.7600
Epoch [240/1000], Loss: 1944465630.7200
Epoch [250/1000], Loss: 1849459696.6400
E

In [15]:
# -----------------------------
# 8. Evaluate the Model
# -----------------------------

# model.eval() sets the model to evaluation mode. This is necessary because some layers 
# (like dropout and batch normalization) behave differently during training and evaluation.
model.eval()

# with torch.no_grad(): disables gradient calculation. This is useful for inference because 
# it reduces memory consumption and speeds up computations.
with torch.no_grad():
    predictions = model(X_test_tensor.to(device))
    test_loss = criterion(predictions, y_test_tensor.to(device)).item()
    print("Test Mean Squared Error:", test_loss)

# Optionally, cross-check the value with scikit-learn's MSE on the same predictions.
# The predictions are moved to the CPU and converted to a numpy array first.
predictions_np = predictions.cpu().numpy()
mse = mean_squared_error(y_test, predictions_np)
print("Test MSE (scikit-learn):", mse)


Test Mean Squared Error: 926390656.0
Test MSE (scikit-learn): 926390656.0


In [16]:
# Load the test data from 'test.csv' (path relative to the working directory) and display it.
test_df = pd.read_csv('test.csv')
test_df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1001,20,RL,74.0,10206,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,7,2009,WD,Normal
1,1002,30,RL,60.0,5400,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,1,2007,WD,Abnorml
2,1003,20,RL,75.0,11957,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,7,2008,WD,Normal
3,1004,90,RL,,11500,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2007,WD,Normal
4,1005,120,RL,43.0,3182,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,5,2009,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,8,2007,WD,Normal
456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,,0,2,2010,WD,Normal
457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,0,,GdPrv,Shed,2500,5,2010,WD,Normal
458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2010,WD,Normal


In [17]:
# Print the shape (rows, columns) of the test dataset.
print(f"Shape of test_df: {test_df.shape}")

Shape of test_df: (460, 80)


In [31]:
# Preprocess the test data (fill missing values, one-hot encode, align to the training layout).
test_df_processed = preprocess_data(test_df, is_train=False)

# Ensure the test data has exactly the training feature columns (everything except
# the target 'SalePrice'), in the training order. A single reindex adds any column
# that is still missing (filled with 0) and drops everything else, instead of
# inserting columns one at a time.
feature_columns = data_df_processed.columns.drop('SalePrice')
test_df_processed = test_df_processed.reindex(columns=feature_columns, fill_value=0)

# Select only the top 4 features with the highest correlation with 'SalePrice'.
# NOTE: from here on X_test and X_test_tensor hold the test.csv features and no longer
# the hold-out split created earlier, so the hold-out evaluation cell cannot be
# re-run after this cell without re-running the split and tensor-conversion cells first.
X_test = test_df_processed[top4_features].values

# Standardize the test data with the scaler that was fitted on the training split.
X_test_scaled = scaler.transform(X_test)

# Convert the test data to a float32 tensor on the model's device.
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32).to(device)

In [32]:
# Print shapes for debugging: raw test frame, encoded/aligned frame, and the model input tensor.
print(f"Shape of test_df: {test_df.shape}")
print(f"Shape of test_df_processed: {test_df_processed.shape}")
print(f"Shape of X_test_tensor: {X_test_tensor.shape}")

Shape of test_df: (460, 80)
Shape of test_df_processed: (460, 246)
Shape of X_test_tensor: torch.Size([460, 4])


In [36]:
# Run the trained model on the test-set tensor (inference only: eval mode, no gradients).
model.eval()
with torch.no_grad():
    predictions = model(X_test_tensor)
    # Move the predictions to the CPU and convert them to a numpy array.
    predictions_np = predictions.cpu().numpy()

# Print the shape of the predictions
print(f"Shape of predictions_np: {predictions_np.shape}")

Shape of predictions_np: (460, 1)


In [37]:
# Ensure the number of predictions matches the number of test samples
assert len(predictions_np) == len(test_df), "Number of predictions does not match number of test samples"

# Build the output table: one row per test sample with its Id and predicted SalePrice.
# The predictions are flattened to one dimension before being used as a column.
predictions_df = pd.DataFrame({
    'Id': test_df['Id'].astype(int),
    'SalePrice': predictions_np.flatten().astype(float)
})

# Save the predictions to predictions.csv (without the index column).
predictions_df.to_csv('predictions.csv', index=False)