In [7]:
# Training and export for exactly 9 features
# - Output target is Median House Value in 100k USD (per dataset definition)
# - API will multiply predictions by 100_000 to report USD

from sklearn.datasets import fetch_california_housing
import numpy as np

# 1. Load the California housing frame and derive the engineered columns.
housing = fetch_california_housing(as_frame=True)

# `assign` returns a new frame, so `housing.frame` itself is left untouched.
# The dataset's column names are used as-is; the three ratios / log terms are
# appended after the original columns, in this order.
df = housing.frame.assign(
    RoomsPerPerson=lambda frame: frame["AveRooms"] / frame["AveOccup"],
    BedrmRoomRatio=lambda frame: frame["AveBedrms"] / frame["AveRooms"],
    LogPopulation=lambda frame: np.log1p(frame["Population"]),
)

# Exact 9-feature set: six raw columns followed by the three engineered ones.
# The order matters because it is what gets written to feature_order.json.
features_for_model = [
    "MedInc", "HouseAge", "AveRooms", "AveBedrms", "Population", "AveOccup",
    "RoomsPerPerson", "BedrmRoomRatio", "LogPopulation",
]
X = df.loc[:, features_for_model].copy()

# Target (in 100k USD)
y = housing.target

# 2. Split & scale
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply that
# same fitted scaler to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 3. Train baseline model (Linear Regression for v1)
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor


# Baseline for v1: ordinary least squares fitted on the standardized features.
model = LinearRegression()
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`: that keyword
# was deprecated in scikit-learn 1.4 and later removed, whereas this form gives
# the same number on every version.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"RMSE (100k USD): {rmse:.3f}")
print(f"R^2: {r2:.3f}")

# Random forest for comparison; it is fitted on the unscaled features.
forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)
forest_reg.fit(X_train, y_train)

y_pred_forest = forest_reg.predict(X_test)

rmse_forest = np.sqrt(mean_squared_error(y_test, y_pred_forest))
r2_forest = r2_score(y_test, y_pred_forest)

print(f"RMSE (100k USD) Forest: {rmse_forest:.3f}")
print(f"R^2 Forest: {r2_forest:.3f}")


# 4. Persist artifacts
import joblib, os, json
# Write the fitted linear model and its scaler under ./model/ (created on demand).
os.makedirs("model", exist_ok=True)
joblib.dump(value=model, filename="model/housing_model.pkl")
joblib.dump(value=scaler, filename="model/scaler.pkl")

# Store the column order next to the model so a consumer can rebuild the
# feature vector in exactly the order used for training.
with open("model/feature_order.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(features_for_model))

print("Artifacts saved to ./model/")

RMSE (100k USD): 0.736
R^2: 0.586
RMSE (100k USD) Forest: 0.648
R^2 Forest: 0.679
Artifacts saved to ./model/


In [6]:
# === BEST-OF: Ridge (log-target) vs. HistGradientBoosting ===
# - Compares two candidates via CV-RMSE and picks the winner
# - Evaluates on test split
# - Persists API-compatible artifacts:
#   * model/housing_model.pkl -> winner model
#       - Ridge: TransformedTargetRegressor (log/inverse-log) WITHOUT scaler (expects scaled input)
#       - HGB:   plain HistGradientBoostingRegressor (expects raw input)
#   * model/scaler.pkl -> for API compatibility
#       - Ridge: fitted StandardScaler
#       - HGB:   identity FunctionTransformer (so API .transform() is a no-op)
#   * model/feature_order.json -> exact 9 features in the order used by the API
#
# Unit: model predicts in 100k USD; the API multiplies to USD.

import os
import json
import warnings
from typing import List

import joblib
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import HistGradientBoostingRegressor

# Silence every warning category from this point on.
# NOTE(review): a blanket filter also hides deprecation notices; consider
# narrowing it to the specific category that is noisy.
warnings.simplefilter("ignore")

# 1) Data and exact 9 features aligned with the API
housing = fetch_california_housing(as_frame=True)
df_raw = housing.frame.copy()

# Keep the six raw inputs and append the three engineered columns; `assign`
# returns a new frame, so `df_raw` is not modified.
df = df_raw[
    ["MedInc", "HouseAge", "AveRooms", "AveBedrms", "Population", "AveOccup"]
].assign(
    RoomsPerPerson=lambda frame: frame["AveRooms"] / frame["AveOccup"],
    BedrmRoomRatio=lambda frame: frame["AveBedrms"] / frame["AveRooms"],
    LogPopulation=lambda frame: np.log1p(frame["Population"]),
)

# Feature order is part of the contract: it is persisted to feature_order.json.
features: List[str] = [
    "MedInc", "HouseAge", "AveRooms",
    "AveBedrms", "Population", "AveOccup",
    "RoomsPerPerson", "BedrmRoomRatio", "LogPopulation",
]
y = housing.target  # in 100k USD
X = df.loc[:, features].copy()

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

# 2) Scorer & CV
# Both are plain settings objects handed to the randomized searches: the scorer
# is negated RMSE (higher is better), the CV is a seeded, shuffled 5-fold split.
neg_rmse = "neg_root_mean_squared_error"
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# 3) Candidate A: Ridge with log-target
# Train-time pipeline: StandardScaler -> TransformedTargetRegressor(Ridge)
# We will persist ONLY the TTR (no scaler) so the API's scaler is the single scaler applied.
ridge_ttr = TransformedTargetRegressor(
    regressor=Ridge(), func=np.log1p, inverse_func=np.expm1
)
ridge_train_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("ttr", ridge_ttr),
])
# 20 of the 30 log-spaced alpha values between 1e-3 and 1e3 are sampled.
param_ridge = {"ttr__regressor__alpha": np.logspace(-3, 3, 30)}
rs_ridge = RandomizedSearchCV(
    ridge_train_pipe,
    param_distributions=param_ridge,
    n_iter=20,
    scoring=neg_rmse,
    cv=cv,
    n_jobs=-1,
    random_state=42,
)
rs_ridge.fit(X_train, y_train)

# best_score_ is the negated RMSE, so flip the sign to report a positive error.
ridge_cv_rmse = -rs_ridge.best_score_
ridge_test_pred = rs_ridge.best_estimator_.predict(X_test)
# sqrt(MSE) replaces `squared=False`, which was deprecated in scikit-learn 1.4
# and later removed; the value is the same and it runs on every version.
ridge_test_rmse = np.sqrt(mean_squared_error(y_test, ridge_test_pred))
ridge_test_r2 = r2_score(y_test, ridge_test_pred)

# 4) Candidate B: HistGradientBoosting (no scaler required)
hgb = HistGradientBoostingRegressor(random_state=42)
# Discrete grid from which the randomized search draws 30 combinations.
param_hgb = {
    "learning_rate": [0.02, 0.05, 0.1],
    "max_depth": [None, 4, 8, 12],
    "max_leaf_nodes": [31, 63, 127],
    "min_samples_leaf": [10, 20, 50, 100],
    "l2_regularization": [0.0, 0.1, 1.0],
}
rs_hgb = RandomizedSearchCV(
    hgb,
    param_distributions=param_hgb,
    n_iter=30,
    scoring=neg_rmse,
    cv=cv,
    n_jobs=-1,
    random_state=42,
)
rs_hgb.fit(X_train, y_train)

# best_score_ is the negated RMSE, so flip the sign to report a positive error.
hgb_cv_rmse = -rs_hgb.best_score_
hgb_test_pred = rs_hgb.predict(X_test)
# sqrt(MSE) replaces `squared=False`, which was deprecated in scikit-learn 1.4
# and later removed; the value is the same and it runs on every version.
hgb_test_rmse = np.sqrt(mean_squared_error(y_test, hgb_test_pred))
hgb_test_r2 = r2_score(y_test, hgb_test_pred)

# 5) Report both candidates, then pick the one with the lower CV-RMSE.
print(
    f"[CV]  Ridge+logy  RMSE: {ridge_cv_rmse:,.3f}",
    f"[CV]  HGB         RMSE: {hgb_cv_rmse:,.3f}",
    f"[TEST]Ridge+logy  RMSE: {ridge_test_rmse:,.3f} | R^2: {ridge_test_r2:.3f}",
    f"[TEST]HGB         RMSE: {hgb_test_rmse:,.3f} | R^2: {hgb_test_r2:.3f}",
    sep="\n",
)

# HGB wins only on a strictly lower CV error; a tie goes to the Ridge candidate.
if hgb_cv_rmse < ridge_cv_rmse:
    winner_name = "HGB"
else:
    winner_name = "Ridge+logy"
print(f"\n==> Winner by CV: {winner_name}")

# 6) Persist artifacts (API-compatible)
os.makedirs("model", exist_ok=True)

if winner_name == "Ridge+logy":
    # Persist only the TTR (expects scaled input), and the fitted scaler separately
    best_pipe = rs_ridge.best_estimator_
    model_to_save = best_pipe.named_steps["ttr"]
    scaler_to_save = best_pipe.named_steps["scaler"]
else:
    model_to_save = rs_hgb.best_estimator_
    # No `func` is given, so this transformer passes its input through unchanged;
    # keep validate=False so API .transform() is a no-op.
    scaler_to_save = FunctionTransformer(validate=False)
    # Fit once to record feature-in meta. This stays best-effort, but a failure
    # is now reported instead of being swallowed silently. print() is used
    # because this cell filters out all warnings.
    try:
        scaler_to_save.fit(X_train)
    except Exception as exc:
        print(
            "Note: identity scaler could not be fitted "
            f"({type(exc).__name__}: {exc}); saving it unfitted."
        )

joblib.dump(model_to_save, "model/housing_model.pkl")
joblib.dump(scaler_to_save, "model/scaler.pkl")

# The feature order is saved alongside the model so inputs can be assembled in
# the same column order that was used for training.
with open("model/feature_order.json", "w", encoding="utf-8") as f:
    json.dump(features, f)

print("\nSaved: model/housing_model.pkl, model/scaler.pkl, model/feature_order.json")
print("Note: Model outputs are in 100k USD. The API multiplies to USD.")



[CV]  Ridge+logy  RMSE: 1.010
[CV]  HGB         RMSE: 0.632
[TEST]Ridge+logy  RMSE: 0.879 | R^2: 0.411
[TEST]HGB         RMSE: 0.637 | R^2: 0.690

==> Winner by CV: HGB

Saved: model/housing_model.pkl, model/scaler.pkl, model/feature_order.json
Note: Model outputs are in 100k USD. The API multiplies to USD.
