In [1]:
import joblib
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV

from data_utils import train_test_split, build_pipeline

In [2]:
SEED = 42

In [3]:
(df_train_X, train_y), (df_test_X, test_y) = train_test_split(seed=SEED)

pipeline = build_pipeline(df_train_X)
pipeline.fit(df_train_X)
train_X = pipeline.transform(df_train_X)
test_X = pipeline.transform(df_test_X)
print(f"train_X.shape = {train_X.shape}")
print(f"test_X.shape = {test_X.shape}")

train_X.shape = (16512, 16)
test_X.shape = (4128, 16)


In [4]:
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]

In [5]:
model = RandomForestRegressor(random_state=SEED)

grid_search = GridSearchCV(
    model, 
    param_grid, 
    cv=5,
    scoring='neg_mean_squared_error',
    return_train_score=True
)
grid_search.fit(train_X, train_y)

GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [6]:
grid_search.best_params_

{'max_features': 8, 'n_estimators': 30}

In [7]:
grid_search.best_estimator_

RandomForestRegressor(max_features=8, n_estimators=30, random_state=42)

In [8]:
df_result = pd.DataFrame(grid_search.cv_results_)
df_result['rmse'] = np.sqrt(- df_result['mean_test_score'])
df_result[['params', 'rmse']].sort_values(by='rmse')

Unnamed: 0,params,rmse
11,"{'max_features': 8, 'n_estimators': 30}",49682.273345
8,"{'max_features': 6, 'n_estimators': 30}",50146.511674
5,"{'max_features': 4, 'n_estimators': 30}",50377.404617
17,"{'bootstrap': False, 'max_features': 4, 'n_est...",51009.495669
10,"{'max_features': 8, 'n_estimators': 10}",51711.127884
7,"{'max_features': 6, 'n_estimators': 10}",52006.198735
15,"{'bootstrap': False, 'max_features': 3, 'n_est...",52724.982259
4,"{'max_features': 4, 'n_estimators': 10}",52741.047043
2,"{'max_features': 2, 'n_estimators': 30}",53384.572751
13,"{'bootstrap': False, 'max_features': 2, 'n_est...",54658.176158


In [9]:
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([7.33442355e-02, 6.29090705e-02, 4.11437985e-02, 1.46726854e-02,
       1.41064835e-02, 1.48742809e-02, 1.42575993e-02, 3.66158981e-01,
       5.64191792e-02, 1.08792957e-01, 5.33510773e-02, 1.03114883e-02,
       1.64780994e-01, 6.02803867e-05, 1.96041560e-03, 2.85647464e-03])

In [10]:
num_features = list(df_train_X.columns)
num_features.remove('ocean_proximity')
extra_features = ['rooms_per_household', 'population_per_household', 'population_per_household']
one_hot_features = list(pipeline.named_transformers_['cat'].categories_[0])
features = num_features + extra_features + one_hot_features
sorted(zip(feature_importances, features), reverse=True)

[(0.36615898061813423, 'median_income'),
 (0.16478099356159054, 'INLAND'),
 (0.10879295677551575, 'population_per_household'),
 (0.07334423551601243, 'longitude'),
 (0.06290907048262032, 'latitude'),
 (0.056419179181954014, 'rooms_per_household'),
 (0.053351077347675815, 'population_per_household'),
 (0.04114379847872964, 'housing_median_age'),
 (0.014874280890402769, 'population'),
 (0.014672685420543239, 'total_rooms'),
 (0.014257599323407808, 'households'),
 (0.014106483453584104, 'total_bedrooms'),
 (0.010311488326303788, '<1H OCEAN'),
 (0.0028564746373201584, 'NEAR OCEAN'),
 (0.0019604155994780706, 'NEAR BAY'),
 (6.0280386727366e-05, 'ISLAND')]

In [11]:
final_model = grid_search.best_estimator_

In [12]:
def evaluate(model, X, y):
    y_hat = model.predict(X)
    mse = mean_squared_error(y, y_hat)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y, y_hat)
    print(f"rmse = {rmse}, mae = {mae}")

In [13]:
evaluate(final_model, test_X, test_y)

rmse = 47730.22690385927, mae = 31797.216198320413


In [14]:
test_y_hat = final_model.predict(test_X)

confidence = 0.95
se = (test_y_hat - test_y) ** 2
np.sqrt(stats.t.interval(confidence, len(se) - 1, loc=se.mean(), scale=stats.sem(se)))

array([45685.10470776, 49691.25001878])

In [15]:
joblib.dump(final_model, 'model.pkl')

['model.pkl']

In [16]:
loaded_model = joblib.load('model.pkl')

In [17]:
evaluate(loaded_model, test_X, test_y)

rmse = 47730.22690385927, mae = 31797.216198320413
