In [2]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import numpy as np

# Fetch the dataset and load it as a pandas dataframe
housing = fetch_california_housing(as_frame=True)

# Split dataset
X = housing.data
y = housing.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Instantiate the scaler
scaler = StandardScaler()

# Fit the scaler to the TRAINING data and transform it
X_train_scaled = scaler.fit_transform(X_train)

# Transform the TEST data using teh same fitted scaler
X_test_scaled = scaler.transform(X_test)

# Define the grid for hyperparameters to search
param_grid = {
    'n_estimators': [int(x) for x in np.linspace(start=100, stop=1200, num=12)],
    'max_features': ['auto', 'sqrt'],
    'max_depth': [int(x) for x in np.linspace(5, 30, num=6)],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]    
}

# Import and instantiate model on scaled data
rf = RandomForestRegressor(random_state=42)

# Instntiate RandomizedSearchCV Object
rf_random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)

print("Starting hyperparameter tuning ... This may take a few minutes.")

# Fit the search to the data
rf_random_search.fit(X_train_scaled, y_train)

print("\n Tuning complete!")

# Print the best parameters
print("\nBest parameters found:")
print(rf_random_search.best_params_)

# Get the best model
best_model = rf_random_search.best_estimator_

# Make predictions and evaluate
y_pred = best_model.predict(X_test_scaled)

# Evaluate the model's performance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\n--- Model Evaluation (Tuned Random Forest) ---")
print(f"Mean Squared Error: {mse:.2f}")
print(f"R-Squared Score: {r2:.2f}")

Starting hyperparameter tuning ... This may take a few minutes.
Fitting 3 folds for each of 100 candidates, totalling 300 fits


168 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
68 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\msenk\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\msenk\anaconda3\Lib\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
    ~~~~~~~~~~~~~~~~~~~~~~~~~~^^
  File "C:\Users\msenk\anaconda3\Lib\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_parameter_constraints(
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^
        self._parameter_constraints,


 Tuning complete!

Best parameters found:
{'n_estimators': 600, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 25}

--- Model Evaluation (Tuned Random Forest) ---
Mean Squared Error: 0.24
R-Squared Score: 0.82
