# Housing Price Modeling (Session 1)

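This notebook intentionally follows a simple, linear workflow so that it can later be refactored into a production script. It trains a regression model on `data/housing.csv`, evaluates it, saves the fitted artifact, and demonstrates prediction with both the in-memory and the reloaded model. All paths listed below are relative to the project root, which the code takes to be the parent of the current working directory (`Path("..")`).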

- Dataset: `data/housing.csv`
- Target: `Price`
- Model: StandardScaler + LinearRegression
- Metrics: RMSE, R²
- Artifacts: `scripts/session_1/housing_linear.joblib`

> Next steps (outside this notebook): move logic into `scripts/session_1/train.py` with CLI args and proper logging.


In [None]:
import os
import logging
from pathlib import Path

# --- Logging -----------------------------------------------------------------
# Configure the root logger once and expose a notebook-wide "housing" logger;
# the later cells report their progress through it.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(name)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger("housing")

# --- Filesystem layout -------------------------------------------------------
# All locations are derived from PROJECT_ROOT, i.e. the parent of the current
# working directory.
# NOTE(review): presumably the notebook is launched from a sub-folder of the
# project, so ".." is the repository root -- confirm.
PROJECT_ROOT = Path("..")
DATA_PATH = PROJECT_ROOT.joinpath("data", "housing.csv")
ARTIFACT_DIR = PROJECT_ROOT.joinpath("scripts", "session_1")
MODEL_PATH = ARTIFACT_DIR.joinpath("housing_linear.joblib")

# Make sure the artifact folder exists before anything tries to write into it
# (a no-op when the folder is already there).
ARTIFACT_DIR.mkdir(parents=True, exist_ok=True)

logger.info(f"Data path: {DATA_PATH}")
logger.info(f"Artifact dir: {ARTIFACT_DIR}")


In [None]:
import pandas as pd

# Read the raw CSV into a DataFrame as-is; no cleaning or dtype coercion is
# applied in this cell.
logger.info("Loading dataset...")
df = pd.read_csv(DATA_PATH)
logger.info(f"Loaded {len(df)} rows and {len(df.columns)} columns")

# Preview the first five rows as the cell's rich output.
df.head(n=5)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

logger.info("Preparing features and target...")
# Column roles. ALL_COLUMNS is the header actually present in the loaded frame;
# it is used below to validate the configured names before any modelling.
TARGET = "Price"
ALL_COLUMNS = df.columns.tolist()
NUM_FEATURES = [
    "Avg. Area Income",
    "Avg. Area House Age",
    "Avg. Area Number of Rooms",
    "Avg. Area Number of Bedrooms",
    "Area Population",
]
CAT_FEATURES = [
    # 'Address' exists but is high-cardinality; we'll drop it for a simple baseline
]

# Train/test split settings, named so they can be tuned in one place.
TEST_SIZE = 0.2
RANDOM_STATE = 42  # fixed seed so the split is reproducible across runs

# Fail fast with a readable message when the CSV header does not match the
# configuration, instead of surfacing a bare pandas indexing error later.
missing_columns = [
    col for col in [*NUM_FEATURES, *CAT_FEATURES, TARGET] if col not in ALL_COLUMNS
]
if missing_columns:
    raise KeyError(
        f"Columns missing from the dataset: {missing_columns}. "
        f"Available columns: {ALL_COLUMNS}"
    )

# With CAT_FEATURES empty this is exactly df[NUM_FEATURES].
X = df[NUM_FEATURES + CAT_FEATURES]
y = df[TARGET]

logger.info("Splitting train/test...")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

logger.info("Building pipeline...")
# Numeric columns are always standardized. The categorical branch is only
# added when CAT_FEATURES is populated, so the baseline (empty list) keeps the
# same single-transformer preprocessor as before.
transformers = [("num", StandardScaler(), NUM_FEATURES)]
if CAT_FEATURES:
    transformers.append(
        ("cat", OneHotEncoder(handle_unknown="ignore"), CAT_FEATURES)
    )
preprocessor = ColumnTransformer(transformers=transformers, remainder="drop")

# Preprocessing and regressor live in one Pipeline so that the scaler is fit
# on the training split only and is applied identically at prediction time.
model = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("regressor", LinearRegression()),
    ]
)

logger.info("Training model...")
model.fit(X_train, y_train)

logger.info("Evaluating model...")
y_pred = model.predict(X_test)
# RMSE is computed as sqrt(MSE) so it is expressed in the target's own units.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
logger.info(f"RMSE: {rmse:.2f} | R2: {r2:.4f}")

# Cell output: the held-out metrics as a (rmse, r2) tuple.
rmse, r2


In [None]:
import joblib

# Persist the whole fitted pipeline (preprocessing + regressor) as a single
# joblib artifact so it can be reloaded without re-training.
logger.info(f"Saving model to {MODEL_PATH} ...")
joblib.dump(value=model, filename=MODEL_PATH)
logger.info("Model saved.")

# Cell output: where the artifact was written.
MODEL_PATH


In [None]:
# Demonstrate predictions using the trained pipeline and after reload
import numpy as np

# Take a small batch from the held-out split to exercise both model objects.
sample = X_test.iloc[:5]
logger.info("Predicting with in-memory model...")
preds_in_memory = model.predict(sample)

logger.info("Reloading model from disk and predicting...")
# NOTE: joblib.load unpickles the file, which can execute arbitrary code.
# Only load artifacts from a trusted source (here: the file this notebook
# has just written).
loaded = joblib.load(MODEL_PATH)
preds_loaded = loaded.predict(sample)

logger.info("Comparing predictions (should match closely):")
comparison = pd.DataFrame({
    "pred_in_memory": preds_in_memory,
    "pred_loaded": preds_loaded,
})

# Enforce the "should match" claim instead of leaving it to visual inspection:
# a serialization round-trip that changes predictions must fail the notebook.
np.testing.assert_allclose(
    preds_loaded,
    preds_in_memory,
    err_msg="Reloaded model predictions differ from the in-memory model",
)

# Cell output: side-by-side predictions for the sampled rows.
comparison
