# <center>Class 16: Random Forest and Boosting
</center>

In [None]:
import warnings
warnings.filterwarnings('ignore')

import os
# from pathlib import Path
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from patsy import build_design_matrices, dmatrices

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.inspection import PartialDependenceDisplay
from sklearn.inspection import partial_dependence
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

### Get Data

In [None]:
path = os.path.join(os.pardir, 'data', 'airbnb_london_workfile_adj_book.csv') # this will produce a path with the right syntax for your operating system
path

In [None]:
# DATA IMPORT - FROM FILE
df = pd.read_csv(path)

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.isna().sum().sum()

### EDA

In [None]:
# We focus on normal apartments, n < 8
# .copy() makes the filtered frame independent of the original, so columns
# added to it later are not written into a slice (SettingWithCopyWarning)
df = df[df.n_accommodates < 8].copy()

In [None]:
df.shape

In [None]:
# copy a variable - purpose later, see at variable importance
df['n_accommodates_copy'] = df['n_accommodates']

***numerical variables***

In [None]:
# too long to display and read
df.describe().T

In [None]:
df.price.describe(percentiles = [0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99]).map('{:,.1f}'.format)

***categorical variables***

In [None]:
df.f_room_type.value_counts()

In [None]:
df.f_property_type.value_counts()

In [None]:
df.f_number_of_reviews.value_counts()

In [None]:
df.f_neighbourhood_cleansed.value_counts()

***split train and test***
- train is where we do it all, incl CV

- first pick a smaller than usual training set so that models run faster and check if works
- if works, start anew without these two lines

In [None]:
df_train, df_holdout = train_test_split( df, train_size=0.7, random_state = 20250224)

In [None]:
df_train.shape, df_holdout.shape

In [None]:
# basic variables inc neighbourhood
basic_vars = [
    "n_accommodates",
    "n_beds",
    "n_days_since",
    "f_property_type",
    "f_room_type",
    "f_bathroom",
    "f_cancellation_policy",
    "f_bed_type",
    "f_neighbourhood_cleansed",
]

# reviews
reviews = [
    "n_number_of_reviews",
    "flag_n_number_of_reviews",
    "n_review_scores_rating",
    "flag_review_scores_rating",
]

# dummy variables
amenities = [col for col in df if col.startswith("d_")]

# interactions for the LASSO
# from ch14
X1 = [
    "n_accommodates:f_property_type",
    "f_room_type:f_property_type",
    "f_room_type:d_familykidfriendly",
    "d_airconditioning:f_property_type",
    "d_cats:f_property_type",
    "d_dogs:f_property_type",
]
# with boroughs
X2 = [
    "f_property_type:f_neighbourhood_cleansed",
    "f_room_type:f_neighbourhood_cleansed",
    "n_accommodates:f_neighbourhood_cleansed",
]

In [None]:
predictors_1 = basic_vars
predictors_2 = basic_vars + reviews + amenities
predictors_E = basic_vars + reviews + amenities + X1 + X2

### Random forest

For data preparation we are using the [patsy](https://patsy.readthedocs.io/en/latest/overview.html) package (not [this](https://montypython.fandom.com/wiki/Patsy) Patsy, but almost). `patsy` is a Python package for describing statistical models (especially linear models, or models that have a linear component) and building design matrices. It is closely inspired by and compatible with the formula mini-language used in R and S.

In [None]:
y, X = dmatrices("price ~ " + " + ".join(predictors_2), df_train)

`dmatrices()` constructs two design matrices given a formula_like and data. By convention, the first matrix is the “outcome” or “y” data, and the second is the “predictor” or “x” data.

In [None]:
type(X)

In [None]:
X.design_info.column_names

In [None]:
X.shape

In [None]:
import math

In [None]:
print('The theoretical recommended number of variables: {:.2f}.'.format(math.sqrt(len(X.design_info.column_names))))

In [None]:
rfr = RandomForestRegressor(random_state = 20250224)

A random forest regressor is a meta estimator that fits a number of decision tree regressors on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting. 

In [None]:
tune_grid = {"max_features": [6, 8, 10, 12], "min_samples_leaf": [5, 10, 15]}

In [None]:
tune_grid

In [None]:
rf_random = GridSearchCV(
    estimator = rfr,
    param_grid = tune_grid,
    cv = 5,
    scoring = "neg_root_mean_squared_error",
    verbose = 3,
)

`GridSearchCV()` is an exhaustive search over specified parameter values for an estimator.

In [None]:
%%time
rf_model = rf_random.fit(X, y.ravel())

<br>

Cross-validated results are saved in the grid search object's `cv_results_` attribute. Note that *RMSE* is displayed as a negative number. 

In [None]:
rf_model.cv_results_

In [None]:
df_rf_model_cv_results = pd.DataFrame(rf_model.cv_results_)[[
    'param_max_features', 'param_min_samples_leaf', 'mean_test_score']]

In [None]:
df_rf_model_cv_results.columns = ['max features', 'min node size', 'RMSE']

In [None]:
df_rf_model_cv_results

In [None]:
df_rf_model_cv_results.pivot(
    index = 'max features', 
    columns = 'min node size', 
    values = 'RMSE').round(2)*-1

In [None]:
rf_model.best_score_

In [None]:
rf_model.best_params_

In [None]:
rf_model.best_estimator_

In [None]:
# Build the holdout matrices from the design info learned on the training data
# (y, X), so the dummy columns - factor levels and their order - are exactly the
# ones the forest was fitted on, even if some level is absent from the holdout set.
y_h, X_h = build_design_matrices([y.design_info, X.design_info], df_holdout)
pred = rf_model.predict(X_h)

In [None]:
from statsmodels.tools.eval_measures import rmse

In [None]:
# y_h is a 2-D (n, 1) design matrix while pred is 1-D (n,); without flattening,
# the difference broadcasts to an n x n array and rmse returns n values instead of one
rmse(np.asarray(y_h).ravel(), pred)

In [None]:
# take the root explicitly: the `squared` argument of mean_squared_error is
# deprecated and missing from newer scikit-learn releases
np.sqrt(mean_squared_error(y_h, pred))

### Diagnostics

#### feature importances

***individual***

In [None]:
rf_model.best_estimator_.feature_importances_

In [None]:
pd.DataFrame(
    rf_model.best_estimator_.feature_importances_, 
    X.design_info.column_names)

In [None]:
# impurity-based importance of every design-matrix column, largest first
df_var_imp = (
    pd.DataFrame(
        {
            "variable": X.design_info.column_names,
            "imp": rf_model.best_estimator_.feature_importances_,
        }
    )
    .sort_values(by="imp", ascending=False)
    .reset_index(drop=True)
)

# running total of importance down the ranking
df_var_imp = df_var_imp.assign(cumulative_imp=df_var_imp["imp"].cumsum())

In [None]:
df_var_imp.style.format({
    'imp': lambda x: f'{x:,.1%}',
    'cumulative_imp': lambda x: f'{x:,.1%}'})

<br> 

Plotting var imp per se results in a nasty chart.

In [None]:
df_var_imp\
    .sort_values(by = 'imp')\
    .plot(kind = 'barh', 
          x = 'variable', y = 'imp', 
          figsize = (10,10), grid = True, 
          title = 'Random forest model highest feature importances', 
          xlabel = 'variables', legend = False
         );

In [None]:
# we only care for variables with an importance of more than 1 pct
cutoff = 0.01

In [None]:
df_var_imp[df_var_imp.imp > cutoff]\
    .sort_values(by = 'imp')\
    .plot(kind = 'barh', 
          x = 'variable', y = 'imp', 
          figsize = (10,6), grid = True, 
          title = 'Random forest model highest feature importances', 
          xlabel = 'variables', legend = False
         );

Formatting x-axis labels as percentages.

In [None]:
import matplotlib.ticker as mtick

In [None]:
ax = df_var_imp[df_var_imp.imp > cutoff]\
    .sort_values(by = 'imp')\
    .plot(kind = 'barh',
          x = 'variable', 
          y = 'imp',
          figsize = (10,6), 
          grid = True,
          title = 'Random forest model highest feature importances',
          xlabel = 'variables', 
          legend = False)

ax.xaxis.set_major_formatter(mtick.PercentFormatter(xmax=1))

***Grouped variable importance - keep binaries created off factors together***

For this, you need to create an `sklearn` Pipeline including `OneHotEncoding` (before, encoding was done by patsy's `dmatrices`). This way permutation_importance can calculate factor variables' importance 

In [None]:
categorical_columns = [col for col in predictors_2 if col.startswith("f_")]
numerical_columns = [col for col in predictors_2 if col not in categorical_columns]

In [None]:
numerical_columns

In [None]:
categorical_columns

In [None]:
categorical_encoder = OneHotEncoder(handle_unknown="ignore")

preprocessing = ColumnTransformer(
    [("cat", categorical_encoder, categorical_columns),
    ("num", "passthrough", numerical_columns)])

rf_pipeline = Pipeline(
    [("preprocess", preprocessing), 
     ("regressor", rf_model.best_estimator_)] # put best model to pipeline
)

In [None]:
%%time
rf_pipeline.fit(df_train[predictors_2],df_train.price)

`Permutation feature importance` overcomes limitations of the impurity-based feature importance: **it does not have a bias toward high-cardinality features** and can be computed on a left-out test set.

In [None]:
%%time
result = permutation_importance(
    rf_pipeline,
    df_holdout[predictors_2],
    df_holdout.price,
    n_repeats=10,
    random_state=45,
)

In [None]:
result

In [None]:
pd.DataFrame(
        result.importances_mean,
        df_train[predictors_2].columns)

In [None]:
grouped = [
    "f_bed_type",
    "f_property_type",
    "f_room_type",
    "f_bathroom",
    "n_days_since",
    "n_accommodates",
    "n_beds",
    "f_neighbourhood_cleansed",
    "f_cancellation_policy",
]

In [None]:
df_grouped_var_imp = pd.DataFrame(
        result.importances_mean,
        df_train[predictors_2].columns)\
    .loc[grouped]\
    .sort_values(by = 0, ascending = False)\
    .reset_index()\
    .rename({'index': 'variable', 0: 'imp'}, axis = 1)
df_grouped_var_imp['cumulative_imp'] = df_grouped_var_imp.imp.cumsum()

In [None]:
df_grouped_var_imp.style.format({
    'imp': lambda x: f'{x:,.1%}',
    'cumulative_imp': lambda x: f'{x:,.1%}'})

In [None]:
ax = df_grouped_var_imp\
    .sort_values(by = 'imp')\
    .plot(kind = 'barh', 
          x = 'variable', y = 'imp', 
          figsize = (10,6), grid = True, 
          title = 'Random forest model grouped feature importances', 
          xlabel = 'variables', legend = False
         )
ax.xaxis.set_major_formatter(mtick.PercentFormatter(xmax=1, decimals = 0));

In [None]:
# permutation importance of every original (un-encoded) predictor, largest first
df_clean_varimp = pd.DataFrame(
        result.importances_mean,
        df_train[predictors_2].columns)\
    .sort_values(by = 0, ascending = False)\
    .reset_index()\
    .rename({'index': 'variable', 0: 'imp'}, axis = 1)

# cumulate this table's own importances; df_var_imp holds the impurity-based
# importances of the patsy dummy columns, i.e. different rows in a different order
df_clean_varimp['cumulative_imp'] = df_clean_varimp['imp'].cumsum()
# NOTE(review): permutation importances are score drops and need not sum to 1,
# so 0.91 is not a share of total importance here - confirm the intended cutoff
df_clean_varimp[df_clean_varimp.cumulative_imp < 0.91]

In [None]:
ax = df_clean_varimp.iloc[0:10]\
    .sort_values(by = 'imp')\
    .plot(kind = 'barh', 
          x = 'variable', y = 'imp', 
          figsize = (10,6), grid = True, 
          title = 'Random forest model top 10 feature importances with grouped variables', 
          xlabel = 'variables', legend = False
         )
ax.xaxis.set_major_formatter(mtick.PercentFormatter(xmax=1, decimals = 0));

### Partial dependence plots 

Note: the easy way is to use `sklearn`'s `partial_dependence` function and `PartialDependenceDisplay` class; we do this on the holdout set!   
Also, note that we run it not on the `rf_model` but on the `rf_pipeline` to manage OneHot_Encoding on the fly.

In [None]:
accomodates_pdp = partial_dependence(
    rf_pipeline, df_holdout[predictors_2], ["n_accommodates"], kind="average"
)

In [None]:
accomodates_pdp

In [None]:
# We need to access the elements of this complex data structure
type(accomodates_pdp)

In [None]:
# newer scikit-learn stores the evaluated grid under "grid_values"; the older
# "values" key is deprecated and eventually removed, so use whichever exists
grid_key = "grid_values" if "grid_values" in accomodates_pdp else "values"

pd.DataFrame(
    {'number of accomodates': accomodates_pdp[grid_key][0], 
     'average price': accomodates_pdp['average'][0]}
    )

We can do our pdp plots using Pandas built-in plot method.

In [None]:
# newer scikit-learn stores the evaluated grid under "grid_values"; the older
# "values" key is deprecated and eventually removed, so use whichever exists
grid_key = "grid_values" if "grid_values" in accomodates_pdp else "values"

pd.DataFrame(
    {'number of accomodates': accomodates_pdp[grid_key][0], 
     'average price': accomodates_pdp['average'][0]}
    ).sort_values(by = 'average price').plot(
    kind = 'line', color = 'k', marker = 'o', markersize = 10, linewidth = 0,
    figsize = (10,6), legend = False, grid = True,
    x = 'number of accomodates', y = 'average price', ylim = (0, 140), 
    title = 'Partial dependence plot: number of accomodates'
);

`sklearn` has its own visualization with complicated syntax. 

In [None]:
# named pdp_display (not `display`) so IPython's built-in display() function
# stays usable in the rest of the notebook
pdp_display = PartialDependenceDisplay(
    pd_results = [accomodates_pdp],
    features = [(0,)], 
    feature_names = df_holdout[predictors_2].columns.tolist(), 
    target_idx = 0,
    deciles = {0: np.linspace(1, 7, num=7)}
)
pdp_display.plot()
plt.title('Partial dependence plot for n_accomodates')
plt.show();

In [None]:
roomtype_pdp = partial_dependence(
    rf_pipeline, df_holdout[predictors_2], ["f_room_type"], kind="average"
)

In [None]:
# newer scikit-learn stores the evaluated grid under "grid_values"; the older
# "values" key is deprecated and eventually removed, so use whichever exists
grid_key = "grid_values" if "grid_values" in roomtype_pdp else "values"

pd.DataFrame(
    {'room type': roomtype_pdp[grid_key][0], 
     'average price': roomtype_pdp['average'][0]}
    ).sort_values(by = 'average price').plot(
    kind = 'line', color = 'k', marker = 'o', markersize = 10, linewidth = 0,
    figsize = (6,5), legend = False, grid = True,
    x = 'room type', y = 'average price', ylim = (0, 140), 
    title = 'Partial dependence plot: room type'
);

#### Subsample performance: RMSE / mean(y) 

NOTE:  we do this on the holdout set, using the encoding pipeline `rf_pipeline` again.


In [None]:
df_holdout_w_prediction = df_holdout.assign(
    predicted_price=rf_pipeline.predict(df_holdout[predictors_2])
)

***Creating tables of heterogeneity by various grouping factors***
- apartment size

This is how we start:

In [None]:
# "small apt" means at most 3 guests - the same cut-off (<= 3) that the grouped
# summary table further below uses for this label
df_holdout_w_prediction['is_low_size'] = df_holdout_w_prediction.n_accommodates.map(lambda x: 'small apt' if x <= 3 else 'large apt')

In [None]:
df_holdout_w_prediction.iloc[0:5, -5:]

In [None]:
# RMSE per size group; the root is taken explicitly because the `squared`
# argument of mean_squared_error is deprecated and missing from newer scikit-learn
df_holdout_w_prediction.groupby('is_low_size').apply(lambda x: np.sqrt(mean_squared_error(x.price, x.predicted_price)))

Putting it in a function with additional columns

In [None]:
def calculate_rmse(groupby_obj):
    """Summarise prediction error for every group of a grouped DataFrame.

    Parameters
    ----------
    groupby_obj : pandas DataFrameGroupBy
        Grouped frame whose groups contain the columns ``price`` (observed)
        and ``predicted_price`` (model prediction).

    Returns
    -------
    pandas.DataFrame
        One row per group, rounded to 2 decimals, with the columns
        ``rmse`` (root mean squared prediction error), ``mean_price``
        (average observed price) and ``rmse_normalized`` (rmse / mean_price).
    """

    def _group_rmse(group):
        # root of the mean squared difference between prediction and outcome
        return np.sqrt(np.mean((group.predicted_price - group.price) ** 2))

    rmse_by_group = groupby_obj.apply(_group_rmse)
    # a Series indexed by the group keys, so assign() aligns it by label
    # instead of relying on the positional order of the groups
    mean_price_by_group = groupby_obj["price"].mean()

    return (
        rmse_by_group.to_frame(name="rmse")
        .assign(mean_price=mean_price_by_group)
        # normalised on the unrounded values; rounding happens once at the end
        .assign(rmse_normalized=lambda x: x.rmse / x.mean_price)
        .round(2)
    )

In [None]:
# cheaper or more expensive flats - not used in book
grouped_object = df_holdout_w_prediction.assign(
    is_low_size=lambda x: np.where(x.n_accommodates <= 3, "small apt", "large apt")
).groupby("is_low_size")
accom_subset = calculate_rmse(grouped_object)

In [None]:
accom_subset

- fancy neighborhoods

In [None]:
grouped_object = df_holdout_w_prediction.loc[
    lambda x: x.f_neighbourhood_cleansed.isin(
        [
            "Westminster",
            "Camden",
            "Kensington and Chelsea",
            "Tower Hamlets",
            "Hackney",
            "Newham",
        ]
    )
].groupby("f_neighbourhood_cleansed")
neightbourhood_subset = calculate_rmse(grouped_object)

neightbourhood_subset.sort_values(by = 'mean_price', ascending = False).style.format({'rmse': '{:.1f}', 'mean_price': '{:.1f}', 'rmse_normalized': '{:.2f}'})

- property type

In [None]:
grouped_object = df_holdout_w_prediction.loc[
    lambda x: x.f_property_type.isin(["Apartment", "House"])
].groupby("f_property_type")
proptype_subset = calculate_rmse(grouped_object)

proptype_subset.style.format({'rmse': '{:.1f}', 'mean_price': '{:.1f}', 'rmse_normalized': '{:.2f}'})

In [None]:
# RMSE over the whole holdout set; the root is taken explicitly because the
# `squared` argument of mean_squared_error is deprecated in newer scikit-learn
holdout_rmse = np.sqrt(
    mean_squared_error(
        df_holdout_w_prediction.price,
        df_holdout_w_prediction.predicted_price,
    )
)

# one-row summary with the same columns as the per-group tables
all_holdout = pd.DataFrame(
    {"rmse": [holdout_rmse], "mean_price": [df_holdout_w_prediction.price.mean()]},
    index=["Total"],
).assign(rmse_normalized=lambda x: x.rmse / x.mean_price).round(2)

all_holdout.style.format({'rmse': '{:.1f}', 'mean_price': '{:.1f}', 'rmse_normalized': '{:.2f}'})

In [None]:
# separator rows for the final summary table: every cell holds a dash filler
type_rows = pd.DataFrame(
    "------",
    index=["Apartment size", "Type", "Borough", "------"],
    columns=["rmse", "mean_price", "rmse_normalized"],
    dtype=object,
)

In [None]:
type_rows

**Finally: subsample performance**

In [None]:
pd.concat(
    [
        type_rows.iloc[[0]],
        accom_subset,
        type_rows.iloc[[1]],
        proptype_subset,
        type_rows.iloc[[2]],
        neightbourhood_subset,
        type_rows.iloc[[3]],
        all_holdout,
    ]
)

### Horserace: compare with other models 


1. ***OLS with dummies for area***

 using model B

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
y, X = dmatrices("price ~ " + " + ".join(predictors_2), df_train)

# model fitted on the full training set (used for the coefficient table)
ols_model = LinearRegression().fit(X,y)

# in-sample fitted values
y_hat = ols_model.predict(X)

# 5-fold cross-validated RMSE, so OLS is scored the same way as the tuned models
# it is compared with; an in-sample RMSE would understate its prediction error.
# The scorer returns negated RMSE, hence the sign flip.
ols_rmse = -cross_val_score(
    LinearRegression(),
    X,
    y.ravel(),
    cv=5,
    scoring="neg_root_mean_squared_error",
).mean()
ols_rmse

In [None]:
ols_model_coeffs_df = pd.DataFrame(
    ols_model.coef_.tolist()[0],
    index=X.design_info.column_names,
    columns=["ols_coefficient"],
).assign(ols_coefficient=lambda x: x.ols_coefficient.round(3))

In [None]:
ols_model_coeffs_df

2.  LASSO

using extended model w interactions

In [None]:
from sklearn.linear_model import ElasticNet

The `ElasticNet` model combines the L1 (LASSO) and L2 (Ridge) penalties in a single class. 

The parameter `l1_ratio` (between [0,1]) is the weight of LASSO and Ridge. l1_ratio = 1 is the pure lasso penalty. Currently, l1_ratio <= 0.01 is not reliable, unless you supply your own sequence of alpha.

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html

In [None]:
lasso_model =  ElasticNet(l1_ratio = 1, fit_intercept = True)

In [None]:
lasso_model_cv = GridSearchCV(
    lasso_model,
    # {"alpha":[i/100 for i in range(1, 26, 1)]}, #> this option takes forever to run
    {"alpha":[i/100 for i in range(5, 26, 5)]},
    cv=5,
    scoring="neg_root_mean_squared_error",
    verbose=3,
)


In [None]:
y, X = dmatrices("price ~ " + " + ".join(predictors_E), df_train)

In [None]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
%%time
lasso_model_cv.fit(X_scaled, y.ravel())

In [None]:
lasso_model_cv.best_estimator_

In [None]:
# best_score_ is the mean cross-validated score of the selected alpha; the
# scorer is negated RMSE, so flip the sign (no float-equality lookup needed)
lasso_rmse = -lasso_model_cv.best_score_
lasso_rmse

3. ***CART model***

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
y, X = dmatrices("price ~ " + " + ".join(predictors_2), df_train)

In [None]:
X.shape

In [None]:
cart_model = DecisionTreeRegressor(random_state=20250224)

Get potential ccp_alpha parameters

In [None]:
# named pruning_path so the `path` variable holding the CSV location is not overwritten
pruning_path = cart_model.cost_complexity_pruning_path(X, y.ravel())
ccp_alphas, impurities = pruning_path.ccp_alphas, pruning_path.impurities

Minimal cost-complexity pruning is an algorithm used to prune a tree to avoid over-fitting. This algorithm is parameterized by $\alpha \geq 0$, known as the complexity parameter. The complexity parameter is used to define the cost-complexity measure $R_\alpha(T)$ of a given tree $T$:

<center>
 $R_\alpha(T) = R(T) + \alpha|\widetilde{T}|$   
</center>
 
 where $R(T)$ is the total impurity of the terminal nodes of tree $T$ (scikit-learn uses the total sample-weighted impurity), and $|\widetilde{T}|$ is the number of terminal nodes of tree $T$. 

 By default `DecisionTreeRegressor` uses `squared error` as criterion for goodness-of-fit.

In [None]:
ccp_alphas

In [None]:
ccp_alphas.shape

In [None]:
impurities

Apply random search to select a "best" alpha, default is 10 iterations
`RandomizedSearchCV` does not calculate all potential alphas, just a random 10-element subset of the many potential alphas

In [None]:
%%time

cart_model_cv = RandomizedSearchCV(
    cart_model,
    {"ccp_alpha":ccp_alphas},
    cv = 5,
    scoring="neg_root_mean_squared_error",
    verbose = 3,
)
cart_model_cv.fit(X,y.ravel())

In [None]:
cart_model_cv.best_estimator_

In [None]:
# best_score_ is the mean cross-validated score of the selected ccp_alpha; the
# scorer is negated RMSE, so flip the sign (no float-equality lookup needed)
cart_rmse = -cart_model_cv.best_score_
cart_rmse

4. GBM

**NOTE:** With a complex grid, the search runs for a **very long time**

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# max_features < number of columns makes each split draw a random feature
# subset, so the estimator is seeded to keep the fits reproducible
gbm = GradientBoostingRegressor(learning_rate=0.1, min_samples_split=20, max_features = 10,
                                random_state = 20250224
                                #, n_estimators = 50
                               )

tune_grid = {"n_estimators": [200, 300], "max_depth": [5, 10]}

gbm_model_cv = GridSearchCV(
    gbm,
    tune_grid,
    cv=5,
    scoring="neg_root_mean_squared_error",
    verbose=10,
    n_jobs=-1
)

In [None]:
categorical_columns = [col for col in predictors_2 if col.startswith("f_")]
numerical_columns = [col for col in predictors_2 if col not in categorical_columns]

categorical_encoder = OneHotEncoder(handle_unknown="ignore")

preprocessing = ColumnTransformer(
    [
        ("cat", categorical_encoder, categorical_columns),
        ("num", "passthrough", numerical_columns),
    ]
)

gbm_pipe = Pipeline(
    [("preprocess", preprocessing), ("regressor", gbm_model_cv)], verbose=True
)

In [None]:
%%time
gbm_pipe.fit(df_train[predictors_2],df_train.price)

In [None]:
gbm_model_cv.best_estimator_

In [None]:
gbm_rmse = gbm_model_cv.best_score_*-1

In [None]:
gbm_rmse

Comparing model results on CV RMSE.

In [None]:
# the random forest entry is the cross-validated RMSE of its grid search
# (best_score_ is negated RMSE), not the holdout RMSE stored in all_holdout
pd.DataFrame({'model': ['OLS', 'LASSO', 'CART', 'random forest', 'GBM'],
              'CV RMSE': [ols_rmse, lasso_rmse, cart_rmse, -rf_model.best_score_, gbm_rmse]})

***Histogram-based Gradient Boosting Regression Tree***

Histogram-based Gradient Boosting Regression Tree. It was experimental in older scikit-learn releases (before 1.0), where experimental features had to be enabled first; in current versions it can be imported directly. This implementation is inspired by [LightGBM](https://github.com/Microsoft/LightGBM).

In [None]:
from sklearn.ensemble import HistGradientBoostingRegressor

In [None]:
hgbm_broad = HistGradientBoostingRegressor(random_state = 20250224)

In [None]:
# The triple-quoted block below is an inert string literal, not executed code:
# it keeps the full tuning grid for reference only.
"""
tune_grid = {
    "max_iter": [50, 100, 200],
    "max_depth": [1, 5, 10],
    "learning_rate": [0.1, 0.15, 0.2],
    "min_samples_leaf": [5, 10, 20, 30],
}
"""
# for the sake of simplicity we run on only a restricted tuning set on class
# (2 x 2 x 2 = 8 parameter combinations instead of the 108 in the full grid)
tune_grid = {
    # "max_iter": [50, 100, 200],
    "max_iter": [200],
    # "max_depth": [1, 5, 10],
    "max_depth": [5, 10],
    # "learning_rate": [0.1, 0.15, 0.2],
    "learning_rate": [0.1, 0.15],
    "min_samples_leaf": [10, 20],
}

hgbm_model_cv_broad = GridSearchCV(
    hgbm_broad,
    tune_grid,
    cv=5,
    scoring="neg_root_mean_squared_error",
    verbose=10,
)

In [None]:
categorical_columns = [col for col in predictors_2 if col.startswith("f_")]
numerical_columns = [col for col in predictors_2 if col not in categorical_columns]

categorical_encoder = OneHotEncoder(handle_unknown="ignore")

preprocessing = ColumnTransformer(
    [
        ("cat", categorical_encoder, categorical_columns),
        ("num", "passthrough", numerical_columns),
    ]
)

hgbm_pipe_broad = Pipeline(
    [("preprocess", preprocessing), ("regressor", hgbm_model_cv_broad)], verbose=True
)

In [None]:
%%time
hgbm_pipe_broad.fit(df_train[predictors_2],df_train.price)

In [None]:
hgbm_model_cv_broad.best_params_

In [None]:
print('Broad HGBM RMSE is: {:.4f}.'.format(hgbm_model_cv_broad.best_score_*-1))
print('Simple GBM RMSE is: {:.4f}.'.format(gbm_model_cv.best_score_*-1))