In [3]:
# Training and export for exactly 9 features
# - Output target is Median House Value in 100k USD (per dataset definition)
# - API will multiply predictions by 100_000 to report USD

from sklearn.datasets import fetch_california_housing
import numpy as np

# 1. Load data and build engineered features
housing = fetch_california_housing(as_frame=True)

# Take an explicit copy so that adding the engineered columns below leaves
# `housing.frame` itself untouched. (This replaces a `rename` call whose
# mapping sent every column to its own name and therefore renamed nothing.)
df = housing.frame.copy()

# Engineered ratios and a log-transformed population. The serving code must
# compute these three columns with the same formulas before scaling.
df["RoomsPerPerson"] = df["AveRooms"] / df["AveOccup"]
df["BedrmRoomRatio"] = df["AveBedrms"] / df["AveRooms"]
df["LogPopulation"] = np.log1p(df["Population"])

# Target (in 100k USD)
y = housing.target

# Exact 9-feature set: six raw columns followed by the three engineered ones.
# The order matters: it is the column order the scaler and model are fitted
# on, and it is what gets written to feature_order.json.
features_for_model = [
    "MedInc",
    "HouseAge",
    "AveRooms",
    "AveBedrms",
    "Population",
    "AveOccup",
    "RoomsPerPerson",
    "BedrmRoomRatio",
    "LogPopulation",
]
X = df[features_for_model].copy()

# 2. Split & scale
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same fitted scaler to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 3. Train baseline model (Linear Regression for v1)
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Evaluate on the held-out split.
y_pred = model.predict(X_test_scaled)

# Take the square root of the MSE explicitly instead of passing
# `squared=False`: that keyword was deprecated in scikit-learn 1.4 and removed
# in 1.6, so the old call raises TypeError on current releases. This form
# works on every scikit-learn version.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"RMSE (100k USD): {rmse:.3f}")
print(f"R^2: {r2:.3f}")

# 4. Persist artifacts
import json
import os

import joblib

os.makedirs("model", exist_ok=True)

# Pickle the fitted estimator and the fitted scaler next to each other so they
# can be loaded as a pair.
for artifact, target_path in (
    (model, "model/housing_model.pkl"),
    (scaler, "model/scaler.pkl"),
):
    joblib.dump(artifact, target_path)

# Store the training column order as a JSON list alongside the pickles.
with open("model/feature_order.json", "w", encoding="utf-8") as order_file:
    order_file.write(json.dumps(features_for_model))

print("Artifacts saved to ./model/")


RMSE (100k USD): 0.736
R^2: 0.586
Artifacts saved to ./model/


