In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [2]:
# Relative path to the housing CSV; it is resolved against the kernel's
# current working directory when the file is read.
CSV_PATH = "clean_house_dataset.csv"

# Load the whole file into a single DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer=CSV_PATH)


In [11]:
# Quick look at the loaded data: first rows, then column dtypes / null counts.
print("----Dataset Snapshot---")
print(df.head())
# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called bare: wrapping it in print() appended a stray "None" line.
df.info()

----Dataset Snapshot---
   Size_sqft  Bedrooms  Bathrooms  YearBuilt     Price  Location_City  \
0   1.030281 -1.463643   0.088986  -1.279342  812100.0              1   
1  -0.482463 -1.463643   1.347506   1.326476  547000.0              1   
2   0.468877  0.007430  -1.169534  -1.339942  693700.0              1   
3   1.079817  0.742966   1.347506  -0.915740  848300.0              1   
4   0.788954  1.478502  -1.169534   0.962873  806000.0              0   

   Location_Rural  Location_Suburb  HouseAge  Rooms_per_1000sqft  \
0               0                0  1.279342           -1.061465   
1               0                0 -1.326476           -0.265637   
2               0                0  1.339942           -0.689547   
3               0                0  0.915740           -0.199111   
4               0                1 -0.962873           -0.311002   

   Size_per_Bedroom  Is_City   LogPrice  
0          3.123085        1  13.607380  
1          1.309520        1  13.212206  
2 

In [4]:
# Features: every column except the two price columns. In the snapshot output
# LogPrice tracks the natural log of Price, so keeping it among the features
# would hand the target to the models.
X = df.drop(["Price", "LogPrice"], axis="columns")

# Target: the raw price column.
y = df["Price"]

In [5]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# Linear regression: fit on the training split, then predict the held-out rows.
# (fit() returns the fitted estimator, so construction and fitting are chained.)
lr = LinearRegression().fit(X_train, y_train)
lr_pred = lr.predict(X_test)

# Random forest with 100 trees; random_state pins the forest's randomness so
# repeated runs give the same model.
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
rf_pred = rf.predict(X_test)

In [7]:
def print_metrics(name, y_true, y_pred):
    """Print a short regression report for one set of predictions.

    Computes R2, MAE, MSE and RMSE (the square root of MSE) and writes them
    to stdout under a "<name> Performance:" heading. Nothing is returned.

    Parameters
    ----------
    name : label shown in the heading line.
    y_true : observed target values.
    y_pred : predicted target values, aligned with ``y_true``.
    """
    r_squared = r2_score(y_true, y_pred)
    abs_error = mean_absolute_error(y_true, y_pred)
    sq_error = mean_squared_error(y_true, y_pred)
    root_sq_error = np.sqrt(sq_error)

    # One print call, one report line per argument; sep="\n" yields the same
    # text as printing each line separately.
    print(
        f"\n{name} Performance:",
        f"  R2   : {r_squared:.3f}",
        f"  MAE  : {abs_error:,.0f}",
        f"  MSE  : {sq_error:,.0f}",
        f"  RMSE : {root_sq_error:,.0f}",
        sep="\n",
    )


In [8]:
# Score both models against the same held-out targets.
print_metrics(name="Linear Regression", y_true=y_test, y_pred=lr_pred)
print_metrics(name="Random Forest", y_true=y_test, y_pred=rf_pred)


Linear Regression Performance:
  R2   : 0.848
  MAE  : 63,086
  MSE  : 5,718,940,941
  RMSE : 75,624

Random Forest Performance:
  R2   : 0.859
  MAE  : 52,524
  MSE  : 5,283,317,455
  RMSE : 72,686


In [10]:
# Position (within the test split) of the row used for the spot check.
i = 3
# Double brackets keep a one-row DataFrame, so the models receive 2-D input
# with the original column names.
x_one_df = X_test.iloc[[i]]
y_true = y_test.iloc[i]

# predict() returns an array with one entry for the single row; take it as a
# plain float for formatting.
p_lr_one = float(lr.predict(x_one_df)[0])
p_rf_one = float(rf.predict(x_one_df)[0])

# Show the actual price next to each model's prediction for that row.
print(
    "\n-------Single-row sanity check----------",
    f"  Actual Price: ${y_true:,.0f}",
    f"  LR Pred     : ${p_lr_one:,.0f}",
    f"  RF Pred     : ${p_rf_one:,.0f}",
    sep="\n",
)


-------Single-row sanity check----------
  Actual Price: $554,800
  LR Pred     : $594,041
  RF Pred     : $557,028
