
# Part A – Practical: House Price Prediction

This notebook trains a **Linear Regression** model and a **Random Forest Regressor** on your cleaned dataset, then compares their performance (R², MAE, MSE, RMSE) on a held-out 20% test split.

> **Expected input file:** `clean_house_dataset.csv` in the **same folder** as this notebook.


In [None]:

# 1) Notebook Setup
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Report where the notebook is running and what sits in that folder, so a
# missing dataset file is easy to spot before the load step.
print(
    f"Working directory: {os.getcwd()}",
    f"Files in this folder: {os.listdir()}",
    sep="\n",
)


In [None]:

# 2) Load Dataset
# The CSV is expected to sit beside this notebook; edit CSV_PATH otherwise.
CSV_PATH = "clean_house_dataset.csv"

# Check for the file up front so the failure message says how to fix it.
if os.path.exists(CSV_PATH):
    df = pd.read_csv(CSV_PATH)
else:
    raise FileNotFoundError(
        f"Could not find '{CSV_PATH}'. Move it next to this notebook or edit CSV_PATH to the correct path."
    )

# Quick look at the size and the first rows of what was loaded.
print(f"Dataset Shape: {df.shape}")
display(df.head())


In [None]:

# 3) Prepare Features & Target
# Target (y) = Price
# Features (X) = every numeric or boolean column except Price and LogPrice.
# NOTE(review): LogPrice is presumably a transform of Price, so it is excluded
# to avoid leaking the target into the features — confirm against the dataset.

if "Price" not in df.columns:
    raise KeyError("Column 'Price' not found. Ensure your cleaned dataset includes a 'Price' column.")

drop_cols = [c for c in ["Price", "LogPrice"] if c in df.columns]
X_full = df.drop(columns=drop_cols)
y = df["Price"]

# Keep numeric AND boolean columns. np.number alone does not match bool, which
# would discard one-hot / indicator features stored as True/False in the CSV.
# (The name `numeric_cols` is kept for compatibility with other cells.)
numeric_cols = X_full.select_dtypes(include=[np.number, "bool"]).columns.tolist()
dropped_cols = [c for c in X_full.columns if c not in numeric_cols]
if dropped_cols:
    print("NOTE: Dropping non-numeric columns:", dropped_cols)

# Fail here with a clear message rather than later inside the split or fit.
if not numeric_cols:
    raise ValueError(
        "No numeric or boolean feature columns remain after removing the target; cannot train models."
    )

X = X_full[numeric_cols]

# Represent booleans as 0/1 so both models receive a purely numeric matrix.
bool_cols = X.select_dtypes(include=["bool"]).columns.tolist()
if bool_cols:
    X = X.astype({c: int for c in bool_cols})

print("Features shape:", X.shape)
print("Target shape:", y.shape)


In [None]:

# 4) Split Data (80/20, random_state=42)
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Show the resulting shapes as a quick check on the split sizes.
print(f"Training set: {X_train.shape} {y_train.shape}")
print(f"Test set    : {X_test.shape} {y_test.shape}")


In [None]:

# 5) Train Models
# fit() returns the estimator itself, so construction and training are chained.
lin_reg = LinearRegression().fit(X_train, y_train)

# 100 trees with a fixed seed so repeated runs build the same forest.
rf_reg = RandomForestRegressor(random_state=42, n_estimators=100).fit(X_train, y_train)

print("Models trained.")


In [None]:

# 6) Evaluation Helper
def evaluate_model(name, model, X_test, y_test):
    """Score a fitted model on the test data, print a summary, and return it.

    Args:
        name: Label printed in the report header.
        model: Fitted estimator exposing ``predict``.
        X_test: Feature rows passed to ``model.predict``.
        y_test: True target values compared against the predictions.

    Returns:
        Dict with keys ``"R2"``, ``"MAE"``, ``"MSE"`` and ``"RMSE"``.
    """
    predictions = model.predict(X_test)

    # RMSE is the square root of MSE, so compute MSE once and reuse it.
    mean_sq_err = mean_squared_error(y_test, predictions)
    scores = {
        "R2": r2_score(y_test, predictions),
        "MAE": mean_absolute_error(y_test, predictions),
        "MSE": mean_sq_err,
        "RMSE": np.sqrt(mean_sq_err),
    }

    # (padded label, key in `scores`, format spec) for each report line.
    report_layout = (
        ("R²   ", "R2", ".4f"),
        ("MAE  ", "MAE", ",.0f"),
        ("MSE  ", "MSE", ",.0f"),
        ("RMSE ", "RMSE", ",.0f"),
    )

    print(f"{name} Performance:")
    for label, key, spec in report_layout:
        print(f"  {label}: {scores[key]:{spec}}")
    print("-" * 40)
    return scores


In [None]:

# 7) Evaluate Both Models
# Score each fitted model on the same held-out data, Linear Regression first.
lin_metrics, rf_metrics = (
    evaluate_model(label, estimator, X_test, y_test)
    for label, estimator in (("Linear Regression", lin_reg), ("Random Forest", rf_reg))
)


In [None]:

# 8) Single-row Sanity Check
# Compare both models' predictions with the actual price for one test row.
i = 5  # You can change this to any index within range
if i >= len(X_test):
    # Clamp to the last test row when the chosen position is past the end.
    i = len(X_test) - 1

# Double brackets keep the selection as a one-row DataFrame for predict().
row = X_test.iloc[[i]]
actual = y_test.iloc[i]
lin_pred = float(lin_reg.predict(row)[0])
rf_pred = float(rf_reg.predict(row)[0])

print(
    "Sanity Check (Single Row)",
    f"  Index           : {i}",
    f"  Actual Price    : {actual:,.0f}",
    f"  Linear Predicted: {lin_pred:,.0f}",
    f"  RF Predicted    : {rf_pred:,.0f}",
    sep="\n",
)
