# Gradient Boosting Decision Trees

In [2]:
import warnings 
warnings.filterwarnings ("ignore")

import numpy as np
import pandas as pd
import sklearn
import lightgbm as lgb
import xgboost as xgb
import json

from utils import (Timer, load_airline, convert_related_cols_categorical_to_numeric, 
                   convert_cols_categorical_to_numeric)

# Print the versions of the libraries used below so a run can be reproduced.
print(f"Numpy version:{np.__version__}")
print(f"Pandas version:{pd.__version__}")
print(f"Sklearn version:{sklearn.__version__}")
print(f"LightGBM version:{lgb.__version__}")
print(f"XGBoost version:{xgb.__version__}")

Numpy version:1.22.3
Pandas version:1.4.1
Sklearn version:1.0.2
LightGBM version:3.3.2
XGBoost version:1.5.2


## Airline dataset

```bash
cd data
wget http://kt.ijs.si/elena_ikonomovska/datasets/airline/airline_14col.data.bz2
bzip2 -dk airline_14col.data.bz2
```

In [None]:
%%time
# Load the airline data through the project's `load_airline` helper and
# print the shape of the returned frame as a quick sanity check.
df_plane = load_airline()
print(df_plane.shape)

In [None]:
# Preview the first rows of the raw data (5 is the pandas default for head()).
df_plane.head(n=5)

The first step is to convert the categorical features to numeric features.

In [None]:
%%time
df_plane_numeric = convert_related_cols_categorical_to_numeric(df_plane, col_list=['Origin','Dest'])
del df_plane

In [None]:
# Preview the frame after the Origin/Dest conversion (first 5 rows).
df_plane_numeric.head(n=5)

In [None]:
%%time
df_plane_numeric = convert_cols_categorical_to_numeric(df_plane_numeric, col_list='UniqueCarrier')

To simplify the pipeline, we are going to set up a classification problem where the goal is to classify whether a flight has arrived delayed or not. For that we need to binarize the variable `ArrDelay`.

If you want to extend this experiment, you can set up a regression problem and try to predict the number of minutes of delay a flight has. Both XGBoost and LightGBM have regression classes.

In [None]:
# Downcast every column to int16 in one call to reduce the memory footprint.
# NOTE(review): assumes every value fits in the int16 range; confirm.
df_plane_numeric = df_plane_numeric.astype('int16')

In [None]:
%%time
df_plane_numeric['ArrDelayBinary'] = 1*(df_plane_numeric['ArrDelay'] > 0)

In [None]:
# Preview the frame with the new ArrDelayBinary column (first 5 rows).
df_plane_numeric.head(n=5)

Once the features are prepared, let's split the dataset into train and test sets. We won't use a validation set for this example (however, you can try to add one).

In [None]:
def split_train_val_test_df(df, val_size=0.2, test_size=0.2, random_state=None):
    """Shuffle ``df`` and split it row-wise into train, validation and test sets.

    Args:
        df (pd.DataFrame): Data to split.
        val_size (float): Fraction of rows assigned to the validation set.
        test_size (float): Fraction of rows assigned to the test set.
        random_state (int, optional): Seed forwarded to ``DataFrame.sample`` so
            the shuffle can be reproduced. ``None`` (the default) keeps the
            shuffle non-deterministic.

    Returns:
        tuple: ``(train, validate, test)`` DataFrames, in that order. A set
        whose fraction is 0 comes back as an empty DataFrame that keeps the
        original columns.

    Raises:
        ValueError: If a fraction is negative or the two fractions add up to
            more than 1.
    """
    if val_size < 0 or test_size < 0 or val_size + test_size > 1:
        raise ValueError(
            "val_size and test_size must be non-negative and sum to at most 1, "
            f"got val_size={val_size}, test_size={test_size}"
        )

    n_rows = len(df)
    train_end = int((1 - val_size - test_size) * n_rows)
    val_end = int((1 - test_size) * n_rows)

    shuffled = df.sample(frac=1, random_state=random_state)
    # Positional slicing is used instead of np.split: np.split on a DataFrame
    # goes through DataFrame.swapaxes, which newer pandas releases deprecate.
    train = shuffled.iloc[:train_end]
    validate = shuffled.iloc[train_end:val_end]
    test = shuffled.iloc[val_end:]
    return train, validate, test

def generate_feables(df):
    """Separate a prepared flights frame into features and the binary label.

    Every column except ``ArrDelay`` and ``ArrDelayBinary`` is kept as a
    feature. The column names come from ``Index.difference``, so the feature
    columns are returned in sorted order rather than in the frame's order.

    Args:
        df (pd.DataFrame): Frame that contains an ``ArrDelayBinary`` column.

    Returns:
        tuple: ``(X, y)`` with the feature frame and the ``ArrDelayBinary``
        column.
    """
    label_columns = ['ArrDelay', 'ArrDelayBinary']
    feature_columns = df.columns.difference(label_columns)
    features = df[feature_columns]
    labels = df['ArrDelayBinary']
    return features, labels

In [None]:
%%time
train, validate, test = split_train_val_test_df(df_plane_numeric, val_size=0, test_size=0.2)
print(train.shape)
print(validate.shape)
print(test.shape)

In [None]:
%%time
X_train, y_train = generate_feables(train)
X_val, y_val = generate_feables(validate)
X_test, y_test = generate_feables(test)

In [None]:
# The split frames are no longer needed once X_* / y_* exist.
del train
del validate
del test

## Training

Now we are going to create two pipelines, one for XGBoost and one for LightGBM. The technology behind both libraries is different, so it is difficult to compare them in the exact same model setting. XGBoost grows the trees depth-wise and controls model complexity with `max_depth`. Instead, LightGBM uses a leaf-wise algorithm and controls the model complexity with `num_leaves`. As a tradeoff, we use XGBoost with `max_depth=8`, which allows at most $2^8 = 256$ leaves per tree, and compare it with LightGBM with `num_leaves=255`.

In [None]:
import os

# Shared settings for every model trained below.
results_dict = {}  # timings and metrics, keyed by model name
num_rounds = 200  # number of boosting rounds (n_estimators) per model
# Thread count passed to the models. It is referenced by the model cells but
# was not defined anywhere in this notebook. os.cpu_count() can return None,
# hence the fallback to a single thread.
number_processors = os.cpu_count() or 1

Let's start with the XGBoost classifier.

In [None]:
# Depth-wise XGBoost model: tree size is bounded by max_depth.
# A regressor is fit on the 0/1 ArrDelayBinary target; its outputs are clipped
# at evaluation time and then used as scores.
xgb_clf_pipeline = xgb.XGBRegressor(
    n_estimators=num_rounds,
    learning_rate=0.1,
    max_depth=8,
    min_child_weight=30,
    gamma=0.1,
    reg_lambda=1,
    subsample=1,
    scale_pos_weight=2,
    n_jobs=number_processors,
    random_state=77,
)

In [None]:
# Fit the depth-wise XGBoost model inside the project's Timer context manager;
# the elapsed time is read from `t.interval` in the next cell.
with Timer() as t:
    xgb_clf_pipeline.fit(X_train, y_train)

In [None]:
# Start the result entry for the depth-wise XGBoost model with its fit time.
results_dict['xgb'] = dict(train_time=t.interval)

Training the XGBoost model with leaf-wise growth (`tree_method='hist'` with `grow_policy='lossguide'`).

In [None]:
# Histogram-based XGBoost grown leaf-wise (grow_policy='lossguide'), so tree
# size is bounded by max_leaves; max_depth=0 is passed so that depth is not
# the limiting factor. The remaining settings mirror the depth-wise model.
xgb_hist_clf_pipeline = xgb.XGBRegressor(
    tree_method='hist',
    grow_policy='lossguide',
    max_leaves=255,
    max_depth=0,
    n_estimators=num_rounds,
    learning_rate=0.1,
    min_child_weight=30,
    gamma=0.1,
    reg_lambda=1,
    subsample=1,
    scale_pos_weight=2,
    n_jobs=number_processors,
    random_state=77,
)

In [None]:
# Fit the leaf-wise (hist) XGBoost model inside the project's Timer context
# manager; the elapsed time is read from `t.interval` in the next cell.
with Timer() as t:
    xgb_hist_clf_pipeline.fit(X_train, y_train)

In [None]:
# Start the result entry for the hist XGBoost model with its fit time.
results_dict['xgb_hist'] = dict(train_time=t.interval)

Training LightGBM model.

In [None]:
# LightGBM model: trees grow leaf-wise, so complexity is bounded by num_leaves.
# The class is reached through the `lgb` alias because the bare name
# `LGBMRegressor` is never imported in this notebook (NameError otherwise).
# n_jobs / random_state are the scikit-learn wrapper's own parameter names;
# the `nthread` / `seed` aliases travel through **kwargs and can clash with the
# wrapper's n_jobs default.
lgbm_clf_pipeline = lgb.LGBMRegressor(
    num_leaves=255,
    n_estimators=num_rounds,
    min_child_weight=30,
    learning_rate=0.1,
    scale_pos_weight=2,
    min_split_gain=0.1,
    reg_lambda=1,
    subsample=1,
    n_jobs=number_processors,
    random_state=77,
)

In [None]:
# Fit the LightGBM model inside the project's Timer context manager; the
# elapsed time is read from `t.interval` in the next cell.
with Timer() as t:
    lgbm_clf_pipeline.fit(X_train, y_train)

In [None]:
# Start the result entry for the LightGBM model with its fit time.
results_dict['lgbm'] = dict(train_time=t.interval)

## Evaluation

Now let's evaluate the model in the test set.

In [None]:
with Timer() as t:
    # Timed prediction on the test set; the regressor output is clipped to
    # [0.0001, 0.9999] so it stays strictly inside (0, 1).
    y_prob_xgb = np.clip(
        xgb_clf_pipeline.predict(X_test),
        a_min=0.0001,
        a_max=0.9999,
    )

In [None]:
# Add the prediction time to the depth-wise XGBoost entry.
results_dict['xgb'].update(test_time=t.interval)

In [None]:
with Timer() as t:
    # Timed prediction on the test set; the regressor output is clipped to
    # [0.0001, 0.9999] so it stays strictly inside (0, 1).
    y_prob_xgb_hist = np.clip(
        xgb_hist_clf_pipeline.predict(X_test),
        a_min=0.0001,
        a_max=0.9999,
    )

In [None]:
# Add the prediction time to the hist XGBoost entry.
results_dict['xgb_hist'].update(test_time=t.interval)

In [None]:
with Timer() as t:
    # Timed prediction on the test set; the regressor output is clipped to
    # [0.0001, 0.9999] so it stays strictly inside (0, 1).
    y_prob_lgbm = np.clip(
        lgbm_clf_pipeline.predict(X_test),
        a_min=0.0001,
        a_max=0.9999,
    )

In [None]:
# Add the prediction time to the LightGBM entry.
results_dict['lgbm'].update(test_time=t.interval)

## Metrics

We are going to obtain some metrics to evaluate the performance of each of the models.

In [None]:
# Turn each model's clipped scores into hard class predictions.
# NOTE(review): `binarize_prediction` is not imported in the visible part of
# this notebook; confirm where it comes from and which threshold it applies.
y_pred_xgb, y_pred_xgb_hist, y_pred_lgbm = map(
    binarize_prediction,
    (y_prob_xgb, y_prob_xgb_hist, y_prob_lgbm),
)

In [None]:
# Metrics for the depth-wise XGBoost model: one report from the hard
# predictions and one from the scores, merged into a single mapping.
# NOTE(review): the two `classification_metrics_binary*` helpers are not
# imported in the visible part of this notebook; confirm their origin.
report_xgb = classification_metrics_binary(y_test, y_pred_xgb)
report2_xgb = classification_metrics_binary_prob(y_test, y_prob_xgb)
report_xgb.update(report2_xgb)

In [None]:
# Attach the merged metric report to the depth-wise XGBoost entry.
results_dict['xgb'].update(performance=report_xgb)

In [None]:
# Metrics for the hist XGBoost model: one report from the hard predictions
# and one from the scores, merged into a single mapping.
# NOTE(review): the two `classification_metrics_binary*` helpers are not
# imported in the visible part of this notebook; confirm their origin.
report_xgb_hist = classification_metrics_binary(y_test, y_pred_xgb_hist)
report2_xgb_hist = classification_metrics_binary_prob(y_test, y_prob_xgb_hist)
report_xgb_hist.update(report2_xgb_hist)

In [None]:
# Attach the merged metric report to the hist XGBoost entry.
results_dict['xgb_hist'].update(performance=report_xgb_hist)

In [None]:
# Metrics for the LightGBM model: one report from the hard predictions and
# one from the scores, merged into a single mapping.
# NOTE(review): the two `classification_metrics_binary*` helpers are not
# imported in the visible part of this notebook; confirm their origin.
report_lgbm = classification_metrics_binary(y_test, y_pred_lgbm)
report2_lgbm = classification_metrics_binary_prob(y_test, y_prob_lgbm)
report_lgbm.update(report2_lgbm)

In [None]:
# Attach the merged metric report to the LightGBM entry.
results_dict['lgbm'].update(performance=report_lgbm)

## Results

In [None]:
# Pretty-print the timings and metrics of all models, with keys sorted so
# that runs are easy to compare.
print(
    json.dumps(results_dict, sort_keys=True, indent=4)
)

In [None]:
# Drop the references to the fitted models and the feature matrices.
del xgb_clf_pipeline, xgb_hist_clf_pipeline, lgbm_clf_pipeline
del X_train, X_test, X_val