# Fine-tune the model

We settled on the Random Forest Regressor model. Let us fine-tune its hyper-parameters.

In [1]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import randint
from sklearn.pipeline import Pipeline
from data_transformer import ca_housing_data_transformer
from ca_housing_data import CAHousingData
import numpy as np

## Data Prep
Read the original housing data into a DataFrame. Delete the target feature column `median_house_value`.
Extract the `numerical_cols` and `categorical_col` names, which are used in the data preprocessing stage.
1. `numerical_cols` will be used:
    - ZeroToNaNTransformer: Convert '0' to NaN values, to be replaced by median for that column later
    - SimpleImputer: Replace all NaN by median for that column
    - PerHouseholdFeaturesAdder: Add three more attributes per household.
    - Standard scaling of all the numerical columns
2. `categorical_col`: 'ocean_proximity' will be replaced by a one-hot encoding (OHE). For the test data the strategy will be 'infrequent_if_exist': an unknown category is replaced by the infrequent one.

In [2]:
# Read saved data through the project's CAHousingData helper
# (its loading logic lives in ca_housing_data and is not visible here).
housing_data = CAHousingData()
housing_labels = housing_data.labels()  # Target value to be predicted
housing_features = housing_data.features()  # Features to be used for prediction
# Raw column names of the feature table, later passed to print_feature_rank.
housing_feature_names = housing_features.columns.tolist()  # Used to rank the features
# Column groups and category values handed to ca_housing_data_transformer below.
numerical_cols = housing_data.numerical_features()
categorical_col = housing_data.categorical_features()
ocean_categories = housing_data.ocean_categories()

In [None]:
# Preview the first two rows of the feature table.
housing_features.head(2)

In [None]:
# Preview the first two target values.
housing_labels.head(2)

In [None]:
# Show all raw feature column names as one comma-separated string.
", ".join(housing_feature_names)

In [None]:
# Show the numerical column names as one comma-separated string.
", ".join(numerical_cols)

In [None]:
# Show the categorical column name(s) returned by the data helper.
categorical_col

In [None]:
# Show the category values passed to the data transformer for the one-hot step.
ocean_categories

## Helper Methods
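These handy methods simplify the code and make it reusable:
1. `print_results`: Print the best score, parameters and estimator, plus the RMSE of every candidate
2. `print_feature_rank`: Print the relative feature rank
3. `print_best_params`: Print each best parameter on its own line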

In [9]:
def print_results(search_results):
    """Summarise a fitted hyper-parameter search on stdout.

    Prints the best score, the best parameters and the best estimator,
    then one line per candidate holding the square root of its negated
    mean test score (an RMSE when the search is scored with
    ``neg_mean_squared_error``) followed by the candidate's parameters.
    """
    print("Best score: {:,.2f}".format(search_results.best_score_))
    print("Best parameters: {}".format(search_results.best_params_))
    print("Best estimator: {}".format(search_results.best_estimator_))

    candidates = search_results.cv_results_
    # Scores are negated errors, so flip the sign before taking the root.
    rmse_values = np.sqrt(-np.asarray(candidates["mean_test_score"]))
    for rmse, setting in zip(rmse_values, candidates["params"]):
        print(rmse, setting)

In [10]:
def print_feature_rank(feature_importances, feature_names):
    """Print features ordered from most to least important.

    Importances and names are paired by position (surplus items in the
    longer sequence are ignored), then sorted in descending order of
    importance; ties are ordered by name, also descending. Each line
    shows a 1-based rank, the feature name and its importance to three
    decimals.
    """
    ranked = list(zip(feature_importances, feature_names))
    ranked.sort(reverse=True)
    print("Feature rank:")
    for position, (score, name) in enumerate(ranked, start=1):
        print(f"  {position:02d}. {name}: {score:,.3f}")

In [11]:
def print_best_params(best_params):
    """Print every entry of *best_params* as ``name: value``, one per line."""
    for name in best_params:
        print(f"{name}: {best_params[name]}")

## Random Forest Grid Search

In [12]:
# Build the preprocessing transformer from the project helper; it becomes
# the 'preprocessor' step of the search pipeline defined below.
data_processor = ca_housing_data_transformer(
    numerical_cols, categorical_col, ocean_categories)

__Remark:__ Scikit-learn uses a specific naming convention for the parameters in a search grid.
 - `step_name__parameter_name`: Use double underscores `(__)` to separate step name from parameter name
 - Nested steps are chained the same way, with one `__` per level.
 - Example `preprocessor__num_pipeline__imputer__strategy`:
    - preprocessor: Name of the ColumnTransformer step
    - num_pipeline: Name of the numerical pipeline
    - imputer: Name of the SimpleImputer step
    - strategy: The actual parameter name


In [13]:
# Search over the whole pipeline (preprocessing + Random Forest) on all the data.
rf_cv_pipeline = Pipeline([
    ('preprocessor', data_processor),
    ('model', RandomForestRegressor(random_state=282))
])
# For GridSearchCV
param_grid = [
    # First grid: 3 x 4 x 2 = 24 combinations of hyperparameters
    # (n_estimators x max_features x imputer strategy), e.g.
    # (3, 2), (3, 4), (3, 6), (3, 8), ... each with 'mean' and 'median'
    {
        'model__n_estimators': [3, 10, 30],
        'model__max_features': [2, 4, 6, 8],
        'preprocessor__num_pipeline__imputer__strategy': ['mean', 'median']
    },
    # Then try 2 x 3 = 6 combinations with bootstrap set to False
    # (3, 2), (3, 3), (3, 4), (10, 2), (10, 3), (10, 4)
    {
        'model__bootstrap': [False],
        'model__n_estimators': [3, 10],
        'model__max_features': [2, 3, 4]
    }
]
# 5 folds for each of the 24 + 6 = 30 combinations: 5 x 30 = 150 fits,
# plus one final refit of the best combination on all the data.
# Scores are negated MSE values, so "higher is better" still holds.
grid_search = GridSearchCV(
    rf_cv_pipeline, param_grid, cv=5,
    scoring="neg_mean_squared_error",
    return_train_score=True
)
grid_search = grid_search.fit(housing_features, housing_labels)

In [None]:
# Feature importances is an attribute of the RandomForestRegressor
best_model = grid_search.best_estimator_
# Access feature importances through the model step
feature_importances = best_model.named_steps['model'].feature_importances_
# The forest ranks the columns produced by the preprocessor (numerical
# columns, the added per-household attributes and the one-hot encoded
# categories), not the raw DataFrame columns. Pairing the importances with
# the raw names would truncate and mislabel them, so ask the fitted
# preprocessor for the matching output names.
try:
    ranked_feature_names = list(
        best_model.named_steps['preprocessor'].get_feature_names_out())
except (AttributeError, ValueError):
    # A custom transformer may not support get_feature_names_out; fall
    # back to the raw column names in that case.
    ranked_feature_names = housing_feature_names
if len(ranked_feature_names) != len(feature_importances):
    # Make a mismatch visible instead of letting zip() silently truncate.
    print(
        f"Warning: {len(ranked_feature_names)} feature names for "
        f"{len(feature_importances)} importances; labels may be misaligned."
    )
print_feature_rank(feature_importances, ranked_feature_names)

In [None]:
# Print the best parameter combination found by the grid search, one per line
print_best_params(grid_search.best_params_)

## Random Forest Random Search

In [16]:
# Sampling distributions for RandomizedSearchCV.
# NOTE: scipy's randint excludes `high`, so e.g. max_features is drawn
# from 1..7 and n_estimators from 100..499.
param_distribs = {
    # Model parameters
    'model__n_estimators': randint(low=100, high=500),  # Increased range
    'model__max_features': randint(low=1, high=8),
    'model__max_depth': randint(low=5, high=30),
    'model__min_samples_split': randint(low=2, high=20),
    'model__min_samples_leaf': randint(low=1, high=10),
    'model__bootstrap': [True, False],

    # Preprocessing parameters
    'preprocessor__num_pipeline__imputer__strategy': ['mean', 'median']
}
# 10 sampled candidates x 5 folds = 50 fits, plus the final refit of the
# best candidate; random_state makes the sampled candidates repeatable.
random_search = RandomizedSearchCV(rf_cv_pipeline,
                                   param_distributions=param_distribs,
                                   n_iter=10, cv=5,
                                   scoring='neg_mean_squared_error',
                                   random_state=282)
random_search = random_search.fit(housing_features, housing_labels)

In [None]:
# Rank the features of the best pipeline found by the random search.
best_model = random_search.best_estimator_
feature_importances = best_model.named_steps['model'].feature_importances_
# The forest ranks the columns produced by the preprocessor (numerical
# columns, the added per-household attributes and the one-hot encoded
# categories), not the raw DataFrame columns. Pairing the importances with
# the raw names would truncate and mislabel them, so ask the fitted
# preprocessor for the matching output names.
try:
    ranked_feature_names = list(
        best_model.named_steps['preprocessor'].get_feature_names_out())
except (AttributeError, ValueError):
    # A custom transformer may not support get_feature_names_out; fall
    # back to the raw column names in that case.
    ranked_feature_names = housing_feature_names
if len(ranked_feature_names) != len(feature_importances):
    # Make a mismatch visible instead of letting zip() silently truncate.
    print(
        f"Warning: {len(ranked_feature_names)} feature names for "
        f"{len(feature_importances)} importances; labels may be misaligned."
    )
print_feature_rank(feature_importances, ranked_feature_names)

In [None]:
# Print the best parameter combination found by the random search, one per line
print_best_params(random_search.best_params_)

# Save Model

## Model Confidence

Compute the confidence range.
We are 95% confident that the model's true RMSE lies between these two numbers.

In [None]:
# Save everything (preferred for analysis and reproducibility): the whole
# fitted RandomizedSearchCV object is persisted, not only its best estimator.
# Calling search.predict(X_test) on the reloaded object delegates to
#   search.best_estimator_.predict(X_test)
# NOTE: the target path is relative to the notebook's working directory.

import joblib
joblib.dump(random_search, "../../data/random_forest_regressor.pkl")