In [1]:
# 1. Import Libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [2]:
# 2. Load Dataset
# The path is relative, so the CSV has to live in the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="clean_house_l5_dataset.csv")


In [3]:
# Plain-text preview of the first ten rows to eyeball columns and value ranges.
print(df.head(n=10))

   Size_sqft  Bedrooms  Bathrooms  YearBuilt     Price  Location_City  \
0   1.030281 -1.463643   0.088986  -1.279342  812100.0              1   
1  -0.482463 -1.463643   1.347506   1.326476  547000.0              1   
2   0.468877  0.007430  -1.169534  -1.339942  693700.0              1   
3   1.079817  0.742966   1.347506  -0.915740  848300.0              1   
4   0.788954  1.478502  -1.169534   0.962873  806000.0              0   
5   0.881674  0.007430  -1.169534  -1.339942  724400.0              1   
6   1.280500  1.478502   0.088986  -0.612737  715100.0              0   
7  -0.266538  0.742966  -1.169534  -0.733938  547900.0              0   
8  -0.802540 -0.728107   0.088986  -1.339942  516700.0              0   
9  -1.263604  0.742966   1.347506  -1.036940  418600.0              1   

   Location_Rural  Location_Suburb  HouseAge  Rooms_per_1000sqft  \
0               0                0  1.279342           -1.061465   
1               0                0 -1.326476           -0.26

In [4]:
# 3. Features & Target
# The model predicts the raw Price column.
y = df["Price"]

# Every remaining column is a feature, except the two price columns.
# NOTE(review): LogPrice is presumably a transform of Price (target leakage if kept) - confirm.
# NOTE(review): in the head() preview HouseAge is the exact negative of YearBuilt, and all
# three Location_* dummies are kept; both make the linear model's coefficients
# non-unique (predictions are unaffected) - confirm whether that is intended.
X = df.drop(["Price", "LogPrice"], axis=1)

In [5]:
# 4. Split Data
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the numeric features in the CSV already look standardized. If that
# scaling was fitted on the full dataset before this split, test-set statistics
# leaked into training - confirm against the preprocessing step.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [6]:
# 5. Helper Function for Evaluation
def evaluate_model(model, X_test, y_test):
    """Score a fitted regressor on held-out data.

    Parameters
    ----------
    model : object
        Any fitted estimator exposing ``predict(X)``.
    X_test : array-like
        Feature rows to predict on.
    y_test : array-like
        True target values aligned with ``X_test``.

    Returns
    -------
    tuple
        ``(r2, mae, mse, rmse)`` where ``rmse`` is the square root of ``mse``.
    """
    predictions = model.predict(X_test)

    # Metrics are evaluated in the same order they are returned.
    scores = [
        metric(y_test, predictions)
        for metric in (r2_score, mean_absolute_error, mean_squared_error)
    ]

    # RMSE is derived from the MSE computed above rather than recomputed.
    return (*scores, np.sqrt(scores[-1]))


In [8]:
# 6. Train Models

# Build both estimators up front: an ordinary least-squares baseline and a
# 100-tree random forest whose seed is pinned so repeated runs give the same trees.
lr_model = LinearRegression()
rf_model = RandomForestRegressor(random_state=42, n_estimators=100)

# Fit each estimator on the same training split so their scores are comparable.
lr_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)

In [9]:
# 7. Evaluate Models
# Score both fitted models on the same held-out split.
lr_r2, lr_mae, lr_mse, lr_rmse = evaluate_model(lr_model, X_test, y_test)
rf_r2, rf_mae, rf_mse, rf_rmse = evaluate_model(rf_model, X_test, y_test)

# One print per model; the trailing "\n" on the first report leaves a blank
# line between the two blocks.
print(
    "Linear Regression Performance:\n"
    f"  R²   : {lr_r2:.4f}\n"
    f"  MAE  : {lr_mae:,.0f}\n"
    f"  MSE  : {lr_mse:,.0f}\n"
    f"  RMSE : {lr_rmse:,.0f}\n"
)

print(
    "Random Forest Performance:\n"
    f"  R²   : {rf_r2:.4f}\n"
    f"  MAE  : {rf_mae:,.0f}\n"
    f"  MSE  : {rf_mse:,.0f}\n"
    f"  RMSE : {rf_rmse:,.0f}"
)


Linear Regression Performance:
  R²   : 0.8478
  MAE  : 63,086
  MSE  : 5,718,940,941
  RMSE : 75,624

Random Forest Performance:
  R²   : 0.8594
  MAE  : 52,524
  MSE  : 5,283,317,455
  RMSE : 72,686


In [10]:
# 8. Single-row Sanity Check
# `i` is a positional offset into the test split, not a DataFrame index label.
i = 5

# The true value for that position, and the matching feature row.
actual_price = y_test.iloc[i]
row = X_test.iloc[[i]]  # double brackets keep a one-row DataFrame, as predict() expects 2-D input

# predict() returns an array with one entry per input row; take the only element.
lr_pred = lr_model.predict(row)[0]
rf_pred = rf_model.predict(row)[0]

print(
    "\nSanity Check on Single Row:\n"
    f"  Actual Price       : {actual_price:,.0f}\n"
    f"  Linear Regression  : {lr_pred:,.0f}\n"
    f"  Random Forest      : {rf_pred:,.0f}"
)



Sanity Check on Single Row:
  Actual Price       : 419,200
  Linear Regression  : 411,139
  Random Forest      : 297,368
