In [3]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Load the cleaned dataset
# The path is kept in a constant so the data location is stated in one place.
CSV_PATH = 'dataset/clean_house_l5_dataset.csv'
df = pd.read_csv(filepath_or_buffer=CSV_PATH)
print(df.shape)  # (rows, columns) as a quick check that the load worked
df.head(n=5)     # last expression -> rendered as the cell's table preview

(99, 13)


Unnamed: 0,Size_sqft,Bedrooms,Bathrooms,YearBuilt,Price,Location_City,Location_Rural,Location_Suburb,HouseAge,Rooms_per_1000sqft,Size_per_Bedroom,Is_City,LogPrice
0,1.030281,-1.463643,0.088986,-1.279342,812100.0,1,0,0,1.279342,-1.061465,3.123085,1,13.60738
1,-0.482463,-1.463643,1.347506,1.326476,547000.0,1,0,0,-1.326476,-0.265637,1.30952,1,13.212206
2,0.468877,0.00743,-1.169534,-1.339942,693700.0,1,0,0,1.339942,-0.689547,-0.16397,1,13.449796
3,1.079817,0.742966,1.347506,-0.91574,848300.0,1,0,0,0.91574,-0.199111,-0.307614,1,13.650991
4,0.788954,1.478502,-1.169534,0.962873,806000.0,0,0,1,-0.962873,-0.311002,-0.610027,0,13.59984


In [9]:
# Split features (X) and target (y)
# - 'Price' is the target; 'LogPrice' is dropped alongside it so the model
#   never sees a transformed copy of the value it is asked to predict.
# - 'Unnamed: 0' appears to be the row index that was written into the CSV
#   when it was saved (0, 1, 2, ... in the preview). It carries no information
#   about a house, so it must not be used as a feature. errors='ignore' keeps
#   this cell working if the CSV is later re-exported without that column.
# NOTE(review): in the previewed rows HouseAge == -YearBuilt and
#   Is_City == Location_City, so those pairs look redundant -- confirm on the
#   full data before deciding whether to drop one of each pair.
X = (
    df
    .drop(columns=['Price', 'LogPrice'])
    .drop(columns=['Unnamed: 0'], errors='ignore')
)
y = df['Price']

In [11]:
# Train/test split for fair evaluation
# 20% of the rows are held out for testing; the fixed random_state makes the
# split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

Train shape: (79, 11), Test shape: (20, 11)


In [13]:
# Train Linear Regression
# fit() returns the estimator itself, so construction and fitting are chained.
lr = LinearRegression().fit(X_train, y_train)
lr_pred = lr.predict(X_test)  # predictions on the held-out rows

In [15]:
# Train Random Forest
# 100 trees; random_state is fixed so the fitted forest is reproducible.
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
rf_pred = rf.predict(X_test)  # predictions on the held-out rows

In [17]:
# Helper to print metrics nicely
def print_metrics(name, y_true, y_pred):
    """Print a short regression report (R², MAE, MSE, RMSE) for one model.

    Args:
        name: Label shown in the report header.
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.

    Returns:
        None. The report is written to standard output.
    """
    r_squared = r2_score(y_true, y_pred)
    abs_err = mean_absolute_error(y_true, y_pred)
    sq_err = mean_squared_error(y_true, y_pred)
    root_sq_err = np.sqrt(sq_err)  # RMSE is derived from the MSE computed above

    # Build every line first, then emit them in one go, one per line.
    report = (
        f"\n{name} Performance:",
        f"  R²   : {r_squared:.3f}",
        f"  MAE  : {abs_err:,.0f}",
        f"  MSE  : {sq_err:,.0f}",
        f"  RMSE : {root_sq_err:,.0f}",
    )
    print(*report, sep="\n")

In [19]:
# Show results for both models
# Both reports are scored against the same held-out targets (y_test).
for label, preds in (("Linear Regression", lr_pred), ("Random Forest", rf_pred)):
    print_metrics(label, y_test, preds)


Linear Regression Performance:
  R²   : 0.848
  MAE  : 63,086
  MSE  : 5,718,940,941
  RMSE : 75,624

Random Forest Performance:
  R²   : 0.859
  MAE  : 52,524
  MSE  : 5,283,317,455
  RMSE : 72,686


In [21]:
# Single-row prediction (sanity check)
i = 2                         # position of the test row to inspect
x_one_df = X_test.iloc[[i]]   # list indexer keeps a one-row DataFrame, which predict() expects
y_true = y_test.iloc[i]       # actual price for that same row

# Each model returns a length-1 array for the single row; unwrap to plain floats.
p_lr_one, p_rf_one = (float(model.predict(x_one_df)[0]) for model in (lr, rf))

print("\nSingle-row sanity check:")
print(f"  Actual Price: ${y_true:,.0f}")
print(f"  LR Pred     : ${p_lr_one:,.0f}")
print(f"  RF Pred     : ${p_rf_one:,.0f}")


Single-row sanity check:
  Actual Price: $292,500
  LR Pred     : $188,637
  RF Pred     : $290,899
