In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# --------------------------------
# 1) Load the cleaned dataset
# --------------------------------
# The CSV is referenced by a relative path; the whole file is read into `df`.
CSV_PATH = "clean_house_l5_dataset.csv"
df = pd.read_csv(filepath_or_buffer=CSV_PATH)

In [4]:
# --------------------------------
# 2) Split features (X) and target (y)
# --------------------------------
# "Price" is the value being predicted. "LogPrice" is removed from the
# feature matrix too, so that no target information leaks into X.
y = df["Price"]
X = df.drop(["Price", "LogPrice"], axis=1)

In [5]:
# --------------------------------
# 3) Train/test split for fair evaluation
# --------------------------------
# Hold out 20% of the rows as an unseen test set for measuring generalization.
# The fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [6]:
# --------------------------------
# 4) Train Linear Regression
# --------------------------------
# Baseline model: simple and interpretable.
# fit() returns the estimator itself, so construction and training are chained.
lr = LinearRegression().fit(X_train, y_train)
lr_pred = lr.predict(X_test)  # predictions on the held-out test features

In [7]:
# --------------------------------
# 5) Train Random Forest
# --------------------------------
# Tree ensemble that can capture non-linear relationships; often stronger
# than the linear baseline. random_state pins the forest's randomness.
rf = RandomForestRegressor(random_state=42, n_estimators=100).fit(X_train, y_train)
rf_pred = rf.predict(X_test)  # predictions on the held-out test features

In [8]:
# --------------------------------
# 6) Helper to print metrics nicely
# --------------------------------
def print_metrics(name, y_true, y_pred):
    """Report R², MAE, MSE and RMSE for one model's predictions on stdout.

    Args:
        name: Label shown in the "<name> Performance:" header line.
        y_true: Ground-truth target values.
        y_pred: Predicted values to score against ``y_true``.

    Returns:
        None. The four metrics are printed, not returned.
    """
    # Evaluate the three sklearn metrics in a single pass (R², MAE, MSE order).
    r2, mae, mse = (
        metric(y_true, y_pred)
        for metric in (r2_score, mean_absolute_error, mean_squared_error)
    )
    # RMSE is derived from MSE, which puts it back in the target's own units.
    rmse = np.sqrt(mse)

    print(f"\n{name} Performance:")
    # R²: higher is better, 1.0 is a perfect fit.
    print(f"  R²   : {r2:.3f}")
    # MAE / MSE / RMSE: lower is better.
    print(f"  MAE  : {mae:,.0f}")
    print(f"  MSE  : {mse:,.0f}")
    print(f"  RMSE : {rmse:,.0f}")

In [10]:
# --------------------------------
# 7) Show results for both models
# --------------------------------
# Both models are scored against the same held-out targets, in a fixed order.
for model_name, model_pred in (
    ("Linear Regression", lr_pred),
    ("Random Forest", rf_pred),
):
    print_metrics(model_name, y_test, model_pred)

# --------------------------------
# 8) Single-row prediction (sanity check)
# --------------------------------
def sanity_check_row(i, X, y, models):
    """Print the actual target next to each model's prediction for one row.

    Parameterised on the row position so the same check can be repeated for
    any row without copy-pasting the cell body.

    Args:
        i: Positional (``iloc``) index of the row to inspect.
        X: Feature DataFrame the row is taken from.
        y: Target Series, indexed positionally in step with ``X``.
        models: Mapping of display label -> fitted estimator exposing
            ``predict``. Predictions are printed in mapping order, with
            labels left-aligned to the width of "Actual Price".

    Returns:
        None. The comparison is printed to stdout.
    """
    # iloc[[i]] (double brackets) keeps a 1-row DataFrame with column names,
    # rather than collapsing the row into a Series.
    x_one_df = X.iloc[[i]]
    y_true = y.iloc[i]  # scalar

    # Compute every prediction before printing so a failing model cannot
    # leave a half-printed report.
    preds = {
        label: float(model.predict(x_one_df)[0])
        for label, model in models.items()
    }

    print("\nSingle-row sanity check:")
    print(f"  Actual Price: ${y_true:,.0f}")
    for label, pred in preds.items():
        print(f"  {label:<12}: ${pred:,.0f}")


# Pick one unseen row from X_test and compare both models against the truth.
i = 5
sanity_check_row(i, X_test, y_test, {"LR Pred": lr, "RF Pred": rf})


Linear Regression Performance:
  R²   : 0.848
  MAE  : 63,086
  MSE  : 5,718,940,941
  RMSE : 75,624

Random Forest Performance:
  R²   : 0.859
  MAE  : 52,524
  MSE  : 5,283,317,455
  RMSE : 72,686

Single-row sanity check:
  Actual Price: $419,200
  LR Pred     : $411,139
  RF Pred     : $297,368


In [12]:
i = 6
y_true = y_test.iloc[i]        # scalar target for the chosen test row
x_one_df = X_test.iloc[[i]]    # list indexer -> 1-row DataFrame, feature names kept

# Score that same row with both models (Linear Regression first, then Random Forest).
p_lr_one, p_rf_one = (float(model.predict(x_one_df)[0]) for model in (lr, rf))

print("\nSingle-row sanity check:")
print(f"  Actual Price: ${y_true:,.0f}")
print(f"  LR Pred     : ${p_lr_one:,.0f}")
print(f"  RF Pred     : ${p_rf_one:,.0f}")


Single-row sanity check:
  Actual Price: $367,500
  LR Pred     : $444,366
  RF Pred     : $396,774


In [13]:
i = 7
y_true = y_test.iloc[i]        # scalar target for the chosen test row
x_one_df = X_test.iloc[[i]]    # list indexer -> 1-row DataFrame, feature names kept

# Score that same row with both models (Linear Regression first, then Random Forest).
p_lr_one, p_rf_one = [float(model.predict(x_one_df)[0]) for model in (lr, rf)]

print("\nSingle-row sanity check:")
print(f"  Actual Price: ${y_true:,.0f}")
print(f"  LR Pred     : ${p_lr_one:,.0f}")
print(f"  RF Pred     : ${p_rf_one:,.0f}")


Single-row sanity check:
  Actual Price: $743,700
  LR Pred     : $727,107
  RF Pred     : $724,944
