In [None]:
from pathlib import Path
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.impute import SimpleImputer

import joblib

# Project root: the parent of the current working directory, made absolute.
ROOT = Path("..").resolve()

# Training inputs. The processed file is preferred, but only usable when it
# is fully numeric; the raw file is the fallback.
DATA_PROC = ROOT.joinpath("data", "train", "housing_train_processed.csv")
DATA_RAW = ROOT.joinpath("data", "train", "housing_train.csv")

# Where the fitted model is written.
MODEL_OUT = ROOT.joinpath("models", "random_forest_model.pkl")


In [None]:
# Load the training data: prefer the processed CSV and fall back to the raw
# CSV when the processed file is missing or still has non-numeric columns.
# The fallback is decided by explicit checks instead of a blanket
# `except Exception`, so genuine read errors (corrupt file, permissions, ...)
# surface instead of being silently reported as "using RAW".
housing = None
if DATA_PROC.exists():
    processed = pd.read_csv(DATA_PROC)
    # Anything that is neither numeric nor boolean still needs encoding.
    # This also catches pandas' dedicated string dtype, not only `object`.
    leftover = processed.select_dtypes(exclude=["number", "bool"]).columns
    if len(leftover) == 0:
        housing = processed
    else:
        print("[INFO] Processed file still has non-numeric columns:", list(leftover))
else:
    print("[INFO] Processed file not found:", DATA_PROC)

if housing is None:
    print("[INFO] Using RAW; will one-hot encode + impute here.")
    housing = pd.read_csv(DATA_RAW)

housing.head()


[INFO] Using RAW; will one-hot encode + impute here.


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-121.46,38.52,29.0,3873.0,797.0,2237.0,706.0,2.1736,72100.0,INLAND
1,-117.23,33.09,7.0,5320.0,855.0,2015.0,768.0,6.3373,279600.0,NEAR OCEAN
2,-119.04,35.37,44.0,1618.0,310.0,667.0,300.0,2.875,82700.0,INLAND
3,-117.13,32.75,24.0,1877.0,519.0,898.0,483.0,2.2264,112500.0,NEAR OCEAN
4,-118.7,34.28,27.0,3536.0,646.0,1837.0,580.0,4.4964,238300.0,<1H OCEAN


In [None]:
# Separate the regression target from the features and make the feature
# matrix fully numeric and free of NaNs.
y = housing["median_house_value"].copy()
X = housing.drop(columns=["median_house_value"])

# One-hot encode every column that is neither numeric nor boolean. Selecting
# by dtype family (instead of comparing a single column's dtype to "object")
# also covers columns stored with pandas' dedicated string dtype.
categorical_cols = X.select_dtypes(exclude=["number", "bool"]).columns.tolist()
if categorical_cols:
    X = pd.get_dummies(X, columns=categorical_cols, dtype=float)

# Impute NaNs with the column median (trees don't need scaling).
# `index=X.index` keeps the rows of X aligned with y even if `housing` does
# not carry a default RangeIndex.
# NOTE: the medians are computed on all rows before any cross-validation
# split, so each validation fold contributes to the values used to fill its
# own gaps. Moving the imputer into a Pipeline with the model would remove
# that small leak.
X = pd.DataFrame(
    SimpleImputer(strategy="median").fit_transform(X),
    columns=X.columns,
    index=X.index,
)

# Cheap invariants: same number of rows, and nothing left to impute.
assert len(X) == len(y)
assert not X.isna().any().any()

X.shape, y.shape


((16512, 13), (16512,))

In [7]:
# Baseline: an untuned 100-tree forest scored with 10-fold cross-validation.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# scikit-learn reports the negated MSE per fold (higher is better).
scores = cross_val_score(
    estimator=rf,
    X=X,
    y=y,
    scoring="neg_mean_squared_error",
    cv=10,
    n_jobs=-1,
)

# Flip the sign back and take the root to get one RMSE per fold.
rmse = np.sqrt(np.negative(scores))
print("Baseline RF CV RMSE  mean:", rmse.mean(), "  std:", rmse.std())


Baseline RF CV RMSE  mean: 49429.197374838055   std: 2121.8163210319776


In [None]:
# Exhaustive search over forest size, tree depth, split/leaf sizes, features
# considered per split, and bootstrapping.
# Cost: 3 * 4 * 3 * 3 * 3 * 2 = 648 candidates, each scored on 5 folds,
# i.e. 3240 forest fits. Expect this cell to run for a long time.
param_grid = dict(
    n_estimators=[100, 200, 400],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 5],
    max_features=["sqrt", "log2", None],
    bootstrap=[True, False],
)

grid = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,
    verbose=0,
)
grid.fit(X, y)

# best_score_ is the mean negated MSE of the winning candidate, so the value
# below is the root of the mean fold MSE.
best_mse = -grid.best_score_
best_rmse = np.sqrt(best_mse)
print("Best params:", grid.best_params_)
print("Best CV RMSE:", best_rmse)


In [None]:
# Build a forest with the winning hyper-parameters from the grid search.
best_params = dict(grid.best_params_)
best_rf = RandomForestRegressor(random_state=42, **best_params)

# 10-fold CV of that configuration, comparable to the baseline cell.
# The folds come from the same data the grid search tuned on, so treat the
# number as optimistic rather than as a held-out estimate.
scores_best = cross_val_score(
    estimator=best_rf,
    X=X,
    y=y,
    scoring="neg_mean_squared_error",
    cv=10,
    n_jobs=-1,
)
rmse_best = np.sqrt(np.negative(scores_best))

# Fit on the full training set; this is the model that gets persisted.
best_rf.fit(X, y)

print("Refit RF 10-fold CV RMSE  mean:", rmse_best.mean(), "  std:", rmse_best.std())


In [None]:
# Persist the fitted forest, creating the output directory when needed.
# Only the estimator is written: the one-hot encoding and median imputation
# applied to X earlier in this notebook are not part of the saved object and
# have to be repeated on any data passed to the reloaded model.
model_dir = MODEL_OUT.parent
model_dir.mkdir(parents=True, exist_ok=True)
joblib.dump(best_rf, MODEL_OUT)
print("Saved:", MODEL_OUT)


- Random Forests reduce variance via averaging (Géron Ch. 7).
- No scaling is required; imputation handles NaNs.
- A grid search over tree depth, minimum samples per split/leaf, and features considered per split was used to regularize the forest.
