# 12 — Mini Project: Prediction Pipeline

## Project brief
Build and evaluate a model to predict house prices from synthetic features.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Synthetic housing data. A single seeded generator drives every random draw,
# so the order of the rng calls below fixes the values that come out.
rng = np.random.default_rng(123)
n = 500

# Feature draws, made in the same order as the columns of the frame.
sqft = np.clip(rng.normal(1600, 500, n), 400, 5000)  # bounded to [400, 5000]
bedrooms = rng.integers(1, 6, n)  # upper bound is exclusive: 1..5
age = rng.integers(0, 60, n)  # upper bound is exclusive: 0..59
neighborhood = rng.choice(["A", "B", "C", "D"], n)

df = pd.DataFrame(
    {
        "sqft": sqft,
        "bedrooms": bedrooms,
        "age": age,
        "neighborhood": neighborhood,
    }
)

# Deterministic part of the price: intercept, plus size and bedroom
# contributions, minus a per-year age penalty.
size_term = df["sqft"] * 180
bedroom_term = df["bedrooms"] * 10000
age_penalty = df["age"] * 800
base = 50000 + size_term + bedroom_term - age_penalty

# Fixed premium added per neighborhood label.
neighborhood_premium = {"A": 90000, "B": 50000, "C": 20000, "D": 0}
neigh_boost = df["neighborhood"].map(neighborhood_premium)

# Gaussian noise (std 25000) so the target is not an exact function of the
# features.
noise = rng.normal(0, 25000, n)

df["price"] = (base + neigh_boost) + noise

df.head()

In [None]:
# Split the frame into the target and the predictor columns.
y = df["price"]
X = df.drop(columns=["price"])

# Column groups routed to the two preprocessing branches.
num_cols = ["sqft", "bedrooms", "age"]
cat_cols = ["neighborhood"]

# Numeric branch: fill gaps with the column median, then standardize.
numeric_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: fill gaps with the most frequent label, then one-hot
# encode. handle_unknown="ignore" keeps transform from raising on a category
# that was not present when the encoder was fitted.
categorical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Apply each branch to its own columns and concatenate the results.
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, num_cols),
        ("cat", categorical_pipeline, cat_cols),
    ]
)

# Seeded forest so repeated runs give the same fit.
model = RandomForestRegressor(n_estimators=200, random_state=42)

# Bundling preprocessing with the estimator means the preprocessing steps are
# fitted on the training rows only when pipe.fit is called.
pipe = Pipeline(steps=[("preprocess", preprocess), ("model", model)])

# Hold out 20% of the rows for evaluation; the split is seeded as well.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

pipe.fit(X_train, y_train)
pred = pipe.predict(X_test)

# Report hold-out error and explained variance.
mae = mean_absolute_error(y_test, pred)
r2 = r2_score(y_test, pred)
print("MAE:", round(mae, 2))
print("R2:", round(r2, 3))

## Extension ideas
- Try GradientBoostingRegressor or XGBoost
- Add cross-validation
- Inspect feature importances