# Finetune the model

We settled on the Random Forest Regressor model. Let us fine-tune its hyper-parameters.

In [1]:
# Read the data
import pandas as pd
import numpy as np

# Preprocessed feature matrix and the matching target values, stored as two
# separate files.
housing_features = pd.read_parquet("../../data/housing-geron-processed.parquet")
housing_labels = np.load("../../data/median_house_value_labels.npy")

# The two files are produced independently, so fail fast here if they are
# not row-aligned instead of failing later inside a cross-validation fit.
assert len(housing_features) == len(housing_labels), (
    f"features ({len(housing_features)} rows) and labels "
    f"({len(housing_labels)} entries) are out of sync"
)

In [2]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import randint
from ca_housing_pipeline import create_transformer
import pandas as pd

# Search space for GridSearchCV: a list of independent sub-grids, each of
# which is expanded into the cross product of its value lists.
param_grid = [
    # Sub-grid 1 (default bootstrap):
    # 3 n_estimators values x 4 max_features values = 12 candidates.
    dict(n_estimators=[3, 10, 30], max_features=[2, 4, 6, 8]),
    # Sub-grid 2 (bootstrap switched off):
    # 2 n_estimators values x 3 max_features values = 6 candidates.
    dict(bootstrap=[False], n_estimators=[3, 10], max_features=[2, 3, 4]),
]

# Search space for RandomizedSearchCV: one distribution per hyper-parameter.
# scipy's randint samples from [low, high), so `high` itself is never drawn:
# n_estimators ranges over 1..199 and max_features over 1..7.
param_distribs = dict(
    n_estimators=randint(low=1, high=200),
    max_features=randint(low=1, high=8),
)

In [3]:
def print_results(search_results):
    """Summarise a fitted hyper-parameter search on stdout.

    Prints the best score, best parameters and best estimator, followed by
    one line per evaluated candidate showing the square root of its negated
    mean test score next to its parameters.

    Args:
        search_results: Object exposing ``best_score_``, ``best_params_``,
            ``best_estimator_`` and ``cv_results_`` (with the keys
            ``"mean_test_score"`` and ``"params"``).
    """
    print(f"Best score: {search_results.best_score_:,.2f}")
    print(f"Best parameters: {search_results.best_params_}")
    print(f"Best estimator: {search_results.best_estimator_}")

    per_candidate = search_results.cv_results_
    scored_candidates = zip(per_candidate["mean_test_score"], per_candidate["params"])
    for negated_score, candidate in scored_candidates:
        # Scores are negated before the root, so they are expected to be
        # "negative error" style scores (e.g. neg_mean_squared_error).
        root_error = np.sqrt(-negated_score)
        print(root_error, candidate)

In [4]:
def print_feature_rank(feature_importances, feature_names=None):
    """Print the features ordered from most to least important.

    Args:
        feature_importances: Sequence of importance scores, one per feature
            (for example the ``feature_importances_`` of a fitted forest).
        feature_names: Optional sequence of names aligned with
            ``feature_importances``. When omitted, the columns of the
            notebook-level ``housing_features`` frame are used, so existing
            one-argument calls keep working.

    Raises:
        ValueError: If the number of names differs from the number of
            importances. ``zip`` would otherwise truncate silently and
            print a mislabelled ranking.
    """
    if feature_names is None:
        feature_names = housing_features.columns
    if len(feature_names) != len(feature_importances):
        raise ValueError(
            f"got {len(feature_importances)} importances "
            f"for {len(feature_names)} feature names"
        )

    # Tuples sort on importance first; ties fall back to the feature name.
    feature_rank = sorted(zip(feature_importances, feature_names), reverse=True)
    print("Feature rank:")
    for rank, (importance, feature) in enumerate(feature_rank, start=1):
        print(f"  {rank:02d}. {feature}: {importance:,.3f}")

In [13]:
# Read saved data
# Loads the housing CSV (the file without the "-processed" suffix) into `df`;
# the cells below derive column names and categories from this frame.
df = pd.read_csv("../../data/housing-geron.csv")
# Trailing expression: display the first two rows as the cell output.
df.head(2)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY


In [None]:
# Split the raw frame's columns into the roles the preprocessing step needs.
categorical_col = ['ocean_proximity']
output_labels = df["median_house_value"]
# The target is a label, not a feature: keep it out of the numerical feature
# list so it cannot be fed back into the model as an input.
numerical_cols = df.drop(
    columns=categorical_col + ['median_house_value']
).columns.tolist()
# Distinct category values, in order of first appearance in the data.
ocean_categories = df['ocean_proximity'].unique().tolist()


In [15]:
# Load the preprocessed feature table for inspection. This is the same
# parquet file that `housing_features` is read from in the first cell.
data = pd.read_parquet("../../data/housing-geron-processed.parquet")
# Trailing expression: display the first two rows as the cell output.
data.head(2)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,population_per_household,bedrooms_per_room,NEAR BAY,<1H OCEAN,INLAND,NEAR OCEAN,ISLAND
0,-1.327835,1.052548,0.982143,-0.804819,-0.972476,-0.974429,-0.977033,2.344766,0.628559,-0.049597,-1.029988,1.0,0.0,0.0,0.0,0.0
1,-1.322844,1.043185,-0.607019,2.04589,1.357143,0.861439,1.669961,2.332238,0.327041,-0.092512,-0.888897,1.0,0.0,0.0,0.0,0.0


## Random Forest Grid Search

In [None]:
# Build the preprocessing transformer with the same column metadata that the
# other create_transformer call in this notebook passes (numerical columns,
# categorical column, known category values).
full_pipeline = create_transformer(numerical_cols, categorical_col, ocean_categories)

In [5]:
# Pipeline is not imported anywhere else in this notebook, so bring it into
# scope here.
from sklearn.pipeline import Pipeline

forest_reg_model = RandomForestRegressor(random_state=282)

# Preprocessing + model bundled as a single estimator, seeded like every
# other forest in this notebook.
# NOTE: the grid search below still tunes the bare forest on the already
# preprocessed `housing_features`. Searching over `rf_cv_pipeline` instead
# would need unprocessed inputs and `model__`-prefixed keys in `param_grid`.
rf_cv_pipeline = Pipeline([
    ('preprocessing', full_pipeline),
    ('model', RandomForestRegressor(random_state=282))
])

# 5-fold CV over every candidate in `param_grid`:
# (12 + 6) candidates x 5 folds = 90 fits, plus one final refit of the best
# candidate on all the data (GridSearchCV's default `refit=True`).
grid_search = GridSearchCV(
    forest_reg_model, param_grid, cv=5,
    scoring="neg_mean_squared_error", return_train_score=True
)
grid_search = grid_search.fit(housing_features, housing_labels)

In [6]:
# Report the best grid-search candidate, then one line per candidate with the
# square root of its negated mean CV score (an RMSE here, because the search
# was scored with neg_mean_squared_error).
print_results(grid_search)

Best score: -4,650,221,700.71
Best parameters: {'max_features': 6, 'n_estimators': 30}
Best estimator: RandomForestRegressor(max_features=6, n_estimators=30, random_state=282)
81594.60307904065 {'max_features': 2, 'n_estimators': 3}
72669.71704093461 {'max_features': 2, 'n_estimators': 10}
70440.82698055601 {'max_features': 2, 'n_estimators': 30}
84191.63776255821 {'max_features': 4, 'n_estimators': 3}
72155.19308475297 {'max_features': 4, 'n_estimators': 10}
70123.78634884022 {'max_features': 4, 'n_estimators': 30}
77614.81704160845 {'max_features': 6, 'n_estimators': 3}
71616.64050185456 {'max_features': 6, 'n_estimators': 10}
68192.53405403158 {'max_features': 6, 'n_estimators': 30}
78149.82646783875 {'max_features': 8, 'n_estimators': 3}
70415.53669365369 {'max_features': 8, 'n_estimators': 10}
68778.65033802336 {'max_features': 8, 'n_estimators': 30}
82449.09102279652 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
71915.01700679847 {'bootstrap': False, 'max_features': 

In [7]:
# Rank the input features by the importances of the best grid-search forest.
print_feature_rank(grid_search.best_estimator_.feature_importances_)

Feature rank:
  01. median_income: 0.289
  02. INLAND: 0.151
  03. population_per_household: 0.108
  04. bedrooms_per_room: 0.083
  05. rooms_per_household: 0.083
  06. longitude: 0.079
  07. latitude: 0.077
  08. housing_median_age: 0.041
  09. total_rooms: 0.017
  10. population: 0.017
  11. total_bedrooms: 0.017
  12. households: 0.015
  13. <1H OCEAN: 0.010
  14. NEAR OCEAN: 0.008
  15. NEAR BAY: 0.005
  16. ISLAND: 0.000


## Random Forest Random Search

In [8]:
# Fresh, identically seeded forest so this search is independent of the grid
# search above.
forest_model_2 = RandomForestRegressor(random_state=282)

# Draw 10 random candidates from `param_distribs`, score each with 5-fold CV
# on negative MSE, and fit in the same expression (`fit` returns the search
# object itself).
random_search = RandomizedSearchCV(
    estimator=forest_model_2,
    param_distributions=param_distribs,
    n_iter=10,
    cv=5,
    scoring='neg_mean_squared_error',
    random_state=282,
).fit(housing_features, housing_labels)

In [9]:
# Report the best randomized-search candidate, then one line per sampled
# candidate with the square root of its negated mean CV score (an RMSE here,
# because the search was scored with neg_mean_squared_error).
print_results(random_search)

Best score: -4,580,667,183.15
Best parameters: {'max_features': 6, 'n_estimators': 74}
Best estimator: RandomForestRegressor(max_features=6, n_estimators=74, random_state=282)
68505.33574773646 {'max_features': 2, 'n_estimators': 179}
67680.62635017048 {'max_features': 6, 'n_estimators': 74}
68236.36325050067 {'max_features': 6, 'n_estimators': 38}
70582.21609073247 {'max_features': 1, 'n_estimators': 118}
68309.45470238202 {'max_features': 7, 'n_estimators': 81}
68056.83108845436 {'max_features': 4, 'n_estimators': 171}
68475.65900155467 {'max_features': 2, 'n_estimators': 175}
67909.34272356473 {'max_features': 5, 'n_estimators': 85}
70585.24316035214 {'max_features': 1, 'n_estimators': 154}
67749.7562581322 {'max_features': 5, 'n_estimators': 145}


In [10]:
# Rank the input features by the importances of the best randomized-search forest.
print_feature_rank(random_search.best_estimator_.feature_importances_)

Feature rank:
  01. median_income: 0.306
  02. INLAND: 0.156
  03. population_per_household: 0.108
  04. bedrooms_per_room: 0.083
  05. longitude: 0.079
  06. latitude: 0.074
  07. rooms_per_household: 0.067
  08. housing_median_age: 0.042
  09. total_rooms: 0.017
  10. population: 0.017
  11. total_bedrooms: 0.016
  12. households: 0.015
  13. <1H OCEAN: 0.010
  14. NEAR OCEAN: 0.006
  15. NEAR BAY: 0.004
  16. ISLAND: 0.000


# Save Model

## Model Confidence

Compute the confidence range.  
We are 95% confident that the true RMSE of the model lies between the two endpoints of this range.

In [None]:
from ca_housing_pipeline import create_transformer

# Get original data and prep it for the model
df = pd.read_parquet("../../data/housing-geron.parquet")
income_categories_col = df['income_cat']
median_house_value_labels = df['median_house_value']  # Series
# Only feature columns remain in `df` after this drop.
df = df.drop(columns=['income_cat', 'median_house_value'])

numerical_cols = df.drop(columns=['ocean_proximity']).columns.tolist()
categorical_col = ['ocean_proximity']
ocean_categories = df['ocean_proximity'].unique().tolist()

# Prepare the data for the model.
# Keep the fitted transformer under its own name: rebinding one variable from
# "transformer" to "transformed data" would discard the fitted object, which
# is still needed to transform any other data the same way.
housing_transformer = create_transformer(numerical_cols, categorical_col, ocean_categories)
data_prepared_2 = housing_transformer.fit_transform(df)

In [11]:
# Promote the best estimator found by the randomized search to final model;
# its best CV RMSE printed above (~67.7k) beats the grid search's (~68.2k).
final_model = random_search.best_estimator_


In [None]:
# Save the model
import joblib
from pathlib import Path

MODEL_PATH = "../../models/random_forest_regressor.pkl"
# joblib.dump does not create missing directories, so make sure the target
# folder exists before writing (a no-op when it is already there).
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(final_model, MODEL_PATH)
