In [33]:

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [34]:
# Path to the cleaned housing dataset, given relative to the notebook's
# working directory.
CSV_PATH = "../datasets/clean_house_l5_dataset.csv"
df = pd.read_csv(CSV_PATH)

# Structural summary: (rows, columns) followed by the column names, so the
# loaded schema can be checked at a glance.
print("Dataset shape:", df.shape)
print("Columns:", list(df.columns))

Dataset shape: (99, 13)
Columns: ['Size_sqft', 'Bedrooms', 'Bathrooms', 'YearBuilt', 'Price', 'Location_City', 'Location_Rural', 'Location_Suburb', 'HouseAge', 'Rooms_per_1000sqft', 'Size_per_Bedroom', 'Is_City', 'LogPrice']


In [35]:
# The target column is always removed from the features. "LogPrice" is removed
# as well whenever the dataset has it (presumably a transform of Price, which
# would leak the target into the features -- confirm against the data prep).
drop_cols = ["Price"] + (["LogPrice"] if "LogPrice" in df.columns else [])

# Target vector and feature matrix.
y = df["Price"]
X = df.drop(columns=drop_cols)

In [36]:
# Hold out 20% of the rows as a test set; random_state=42 fixes the shuffle so
# the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [37]:
# Fit the linear model on the training split (fit() returns the estimator
# itself, so construction and fitting can be chained), then predict on the
# held-out test rows.
lr = LinearRegression().fit(X_train, y_train)
lr_pred = lr.predict(X_test)

In [38]:
# Fit a 100-tree random forest on the training split; the fixed random_state
# makes the fitted forest reproducible. Then predict on the held-out test rows.
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
rf_pred = rf.predict(X_test)

In [39]:
def print_metrics(name, y_true, y_pred):
    """Print a short regression report (R², MAE, RMSE) for one model.

    Args:
        name: Label used in the report's header line.
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.

    Returns:
        None. The report is written to standard output.
    """
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    # The MSE is only an intermediate here: its square root (RMSE) is what
    # gets reported, and the MSE itself is intentionally not printed.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))

    report_lines = (
        f"\n{name} Performance:",
        f"  R²   : {r2:.2f}",
        f"  MAE  : {mae:,.0f}",
        f"  RMSE : {rmse:,.0f}",
    )
    print(*report_lines, sep="\n")

In [40]:
# Report both models against the same held-out targets, in a fixed order.
for model_label, model_pred in (
    ("Linear Regression", lr_pred),
    ("Random Forest", rf_pred),
):
    print_metrics(model_label, y_test, model_pred)


Linear Regression Performance:
  R²   : 0.85
  MAE  : 63,086
  RMSE : 75,624

Random Forest Performance:
  R²   : 0.86
  MAE  : 52,524
  RMSE : 72,686


In [41]:
# Positional index (within the test split) of the row to spot-check.
i = 3
# The double brackets keep the features as a one-row DataFrame rather than a
# Series; the target is taken as a scalar from the same position.
x_one_df, y_true = X_test.iloc[[i]], y_test.iloc[i]

In [42]:
# Each model returns an array for the one-row frame; take its only element
# and convert it to a plain Python float.
p_lr_one, p_rf_one = (float(model.predict(x_one_df)[0]) for model in (lr, rf))

In [43]:
# Actual price next to both model predictions for the selected test row.
print(
    "\nSingle-row sanity check:",
    f"  Actual Price: ${y_true:,.0f}",
    f"  LR Pred     : ${p_lr_one:,.0f}",
    f"  RF Pred     : ${p_rf_one:,.0f}",
    sep="\n",
)


Single-row sanity check:
  Actual Price: $554,800
  LR Pred     : $594,041
  RF Pred     : $557,028


In [44]:
# Positional index (within the test split) of the row to spot-check.
i = 6
# The double brackets keep the features as a one-row DataFrame rather than a
# Series; the target is taken as a scalar from the same position.
x_one_df, y_true = X_test.iloc[[i]], y_test.iloc[i]

# Each model returns an array for the one-row frame; take its only element
# and convert it to a plain Python float.
p_lr_one, p_rf_one = (float(model.predict(x_one_df)[0]) for model in (lr, rf))

In [45]:
# Actual price next to both model predictions for the selected test row.
print(
    "\nSingle-row sanity check:",
    f"  Actual Price: ${y_true:,.0f}",
    f"  LR Pred     : ${p_lr_one:,.0f}",
    f"  RF Pred     : ${p_rf_one:,.0f}",
    sep="\n",
)


Single-row sanity check:
  Actual Price: $367,500
  LR Pred     : $444,366
  RF Pred     : $396,774


In [46]:
# Positional index (within the test split) of the row to spot-check.
i = 9
# The double brackets keep the features as a one-row DataFrame rather than a
# Series; the target is taken as a scalar from the same position.
x_one_df, y_true = X_test.iloc[[i]], y_test.iloc[i]

# Each model returns an array for the one-row frame; take its only element
# and convert it to a plain Python float.
p_lr_one, p_rf_one = (float(model.predict(x_one_df)[0]) for model in (lr, rf))

In [47]:
# Actual price next to both model predictions for the selected test row.
print(
    "\nSingle-row sanity check:",
    f"  Actual Price: ${y_true:,.0f}",
    f"  LR Pred     : ${p_lr_one:,.0f}",
    f"  RF Pred     : ${p_rf_one:,.0f}",
    sep="\n",
)


Single-row sanity check:
  Actual Price: $812,100
  LR Pred     : $825,316
  RF Pred     : $832,261


In [48]:
# Positional index (within the test split) of the row to spot-check.
i = 12
# The double brackets keep the features as a one-row DataFrame rather than a
# Series; the target is taken as a scalar from the same position.
x_one_df, y_true = X_test.iloc[[i]], y_test.iloc[i]

# Each model returns an array for the one-row frame; take its only element
# and convert it to a plain Python float.
p_lr_one, p_rf_one = (float(model.predict(x_one_df)[0]) for model in (lr, rf))

In [49]:
# Actual price next to both model predictions for the selected test row.
print(
    "\nSingle-row sanity check:",
    f"  Actual Price: ${y_true:,.0f}",
    f"  LR Pred     : ${p_lr_one:,.0f}",
    f"  RF Pred     : ${p_rf_one:,.0f}",
    sep="\n",
)


Single-row sanity check:
  Actual Price: $345,900
  LR Pred     : $420,031
  RF Pred     : $393,837
