In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
# --------------------------------
# Load the cleaned housing dataset
# --------------------------------

In [3]:
# Location of the cleaned housing data, relative to the notebook's working directory.
csv_path = "Housing_Clean.csv"

In [4]:
# Load the file named by csv_path into the DataFrame `df`; the feature and
# target cells below both read from it.
df = pd.read_csv(csv_path)

In [5]:
# --------------------------------
# Define features (X) and target (y)
# Target = Price, Features = all other columns except Price, Is City, and LogPrice
# --------------------------------

In [6]:
# Feature matrix: every column of df except the three listed here.
x = df.drop(
    columns=[
        "Price",
        "Is City",
        "LogPrice",
    ]
)

In [7]:
# Target vector: the "Price" column of df.
y = df["Price"]

In [8]:
# --------------------------------
# Split data into training and testing sets
# 80% training, 20% testing for fair evaluation
# random_state ensures reproducibility
# --------------------------------

In [9]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# --------------------------------
# Initialize and train Linear Regression model
# Linear Regression is a simple model assuming a linear relationship between features and target
# --------------------------------

In [11]:
# fit() returns the estimator itself, so construction and training can be chained.
lr = LinearRegression().fit(X_train, y_train)

# Predict prices on the held-out test set.
lr_pred = lr.predict(X_test)

In [12]:
# --------------------------------
# Initialize and train Random Forest model
# Random Forest is an ensemble method combining multiple decision trees for better accuracy
# --------------------------------

In [13]:
# 100 trees; the fixed random_state makes the fitted forest repeatable.
rf = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Predict prices on the held-out test set.
rf_pred = rf.predict(X_test)

In [14]:
# --------------------------------
# Function to display performance metrics
# --------------------------------

In [15]:
def print_metrics(name, y, y_predict):
    """Print R2, MAE, MSE and RMSE for one set of predictions.

    Args:
        name: Label shown in the report header.
        y: True target values.
        y_predict: Predicted target values, compared against ``y``.

    Returns:
        None. The report is written to standard output.
    """
    # All metrics are computed before anything is printed.
    score = r2_score(y, y_predict)
    abs_err = mean_absolute_error(y, y_predict)
    sq_err = mean_squared_error(y, y_predict)
    root_sq_err = np.sqrt(sq_err)  # square root of MSE

    report_lines = (
        f"Prediction of {name}",
        f"R2 : {score:.3f}",
        f"MAE : {abs_err:,.0f}",
        f"MSE : {sq_err:,.0f}",
        f"RMSE : {root_sq_err:,.0f}",
        "-"*40,
    )
    for line in report_lines:
        print(line)

In [16]:
# --------------------------------
# Evaluate both models using the test set
# --------------------------------

In [17]:
# Report Linear Regression performance on the held-out test set.
print_metrics("Linear Regression", y=y_test, y_predict=lr_pred)

Prediction of Linear Regression
R2 : 0.848
MAE : 63,086
MSE : 5,718,940,941
RMSE : 75,624
----------------------------------------


In [18]:
# Report Random Forest performance on the held-out test set.
print_metrics("Random Forest", y=y_test, y_predict=rf_pred)

Prediction of Random Forest
R2 : 0.859
MAE : 52,524
MSE : 5,283,317,455
RMSE : 72,686
----------------------------------------


In [19]:
# --------------------------------
# Check predictions on three individual test samples
# This is a sanity check to see how predictions compare to actual prices
# --------------------------------

In [20]:
# Positional (iloc) row numbers within the test split to inspect.
sample_indices = [2, 4, 6]

for i in sample_indices:
    # The double brackets keep the selection as a one-row DataFrame, which is
    # the 2-D input predict() is given here.
    x_one = X_test.iloc[[i]]
    y_true = y_test.iloc[i]  # actual price for the same row

    # predict() returns an array; take its single element as a plain float.
    p_lr = float(lr.predict(x_one)[0])
    p_rf = float(rf.predict(x_one)[0])

    print(
        f"Single-row sanity check (Sample {i}):",
        f"  Actual Price: ${y_true:,.0f}",
        f"  LR Prediction: ${p_lr:,.0f}",
        f"  RF Prediction: ${p_rf:,.0f}",
        "-"*40,
        sep="\n",
    )


Single-row sanity check (Sample 2):
  Actual Price: $292,500
  LR Prediction: $188,637
  RF Prediction: $290,899
----------------------------------------
Single-row sanity check (Sample 4):
  Actual Price: $535,300
  LR Prediction: $609,615
  RF Prediction: $538,756
----------------------------------------
Single-row sanity check (Sample 6):
  Actual Price: $367,500
  LR Prediction: $444,366
  RF Prediction: $396,774
----------------------------------------
