[MRG] Added pygbm-to-lightgbm model conversion (#61)
Created `utils.py` with a utility that builds a LightGBM estimator with the same hyperparameters as a pygbm estimator (see the usage sketch below).
NicolasHug committed Dec 10, 2018
1 parent a52735d commit 974f8af
Showing 3 changed files with 77 additions and 47 deletions.
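A minimal usage sketch of the new helper, not part of the commit itself; the parameter values are illustrative, but the constraints (an explicit `loss` and `scoring=None`) follow from the checks in `utils.py` below:

    # Hypothetical usage of pygbm.utils.get_lightgbm_estimator.
    from pygbm import GradientBoostingClassifier
    from pygbm.utils import get_lightgbm_estimator

    # 'loss' must be explicit ('auto' is rejected) and scoring must be None,
    # since early stopping has no translation in the converter.
    pygbm_clf = GradientBoostingClassifier(loss='binary_crossentropy',
                                           learning_rate=0.1, max_iter=100,
                                           scoring=None)
    lightgbm_clf = get_lightgbm_estimator(pygbm_clf)  # unfitted LGBMClassifier
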
30 changes: 14 additions & 16 deletions benchmarks/bench_higgs_boson.py
@@ -10,6 +10,7 @@
 from sklearn.metrics import accuracy_score, roc_auc_score
 from joblib import Memory
 from pygbm import GradientBoostingClassifier
+from pygbm.utils import get_lightgbm_estimator
 from lightgbm import LGBMClassifier
 import numba

@@ -64,21 +65,6 @@ def load_data():
 n_samples, n_features = data_train.shape
 print(f"Training set with {n_samples} records with {n_features} features.")
 
-if not args.no_lightgbm:
-    print("Fitting a LightGBM model...")
-    tic = time()
-    lightgbm_model = LGBMClassifier(objective='binary',
-                                    n_estimators=n_trees,
-                                    num_leaves=n_leaf_nodes,
-                                    learning_rate=lr, verbose=10,
-                                    min_data_in_bin=1)
-    lightgbm_model.fit(data_train, target_train)
-    toc = time()
-    predicted_test = lightgbm_model.predict(data_test)
-    roc_auc = roc_auc_score(target_test, predicted_test)
-    acc = accuracy_score(target_test, predicted_test)
-    print(f"done in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc :.4f}")
-
 print("JIT compiling code for the pygbm model...")
 tic = time()
 pygbm_model = GradientBoostingClassifier(learning_rate=lr, max_iter=1,
@@ -93,7 +79,8 @@ def load_data():
 
 print("Fitting a pygbm model...")
 tic = time()
-pygbm_model = GradientBoostingClassifier(learning_rate=lr, max_iter=n_trees,
+pygbm_model = GradientBoostingClassifier(loss='binary_crossentropy',
+                                         learning_rate=lr, max_iter=n_trees,
                                          max_bins=max_bins,
                                          max_leaf_nodes=n_leaf_nodes,
                                          random_state=0, scoring=None,
@@ -107,3 +94,14 @@ def load_data():
 
 if hasattr(numba, 'threading_layer'):
     print("Threading layer chosen: %s" % numba.threading_layer())
+
+if not args.no_lightgbm:
+    print("Fitting a LightGBM model...")
+    tic = time()
+    lightgbm_model = get_lightgbm_estimator(pygbm_model)
+    lightgbm_model.fit(data_train, target_train)
+    toc = time()
+    predicted_test = lightgbm_model.predict(data_test)
+    roc_auc = roc_auc_score(target_test, predicted_test)
+    acc = accuracy_score(target_test, predicted_test)
+    print(f"done in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc :.4f}")
55 changes: 55 additions & 0 deletions pygbm/utils.py
@@ -0,0 +1,55 @@
+from lightgbm import LGBMRegressor
+from lightgbm import LGBMClassifier
+
+from .gradient_boosting import GradientBoostingClassifier
+
+
+def get_lightgbm_estimator(pygbm_estimator):
+    """Return an unfitted LightGBM estimator with matching hyperparameters.
+    This utility takes care of renaming the pygbm parameters to their
+    LightGBM equivalents.
+    """
+
+    pygbm_params = pygbm_estimator.get_params()
+
+    if pygbm_params['loss'] == 'auto':
+        raise ValueError('auto loss is not accepted. We need to know if '
+                         'the problem is binary or multiclass classification.')
+    if pygbm_params['scoring'] is not None:
+        raise NotImplementedError('Early stopping should be deactivated.')
+
+    loss_mapping = {
+        'least_squares': 'regression_l2',
+        'binary_crossentropy': 'binary',
+        'categorical_crossentropy': 'multiclass'
+    }
+
+    lgbm_params = {
+        'objective': loss_mapping[pygbm_params['loss']],
+        'learning_rate': pygbm_params['learning_rate'],
+        'n_estimators': pygbm_params['max_iter'],
+        'num_leaves': pygbm_params['max_leaf_nodes'],
+        'max_depth': pygbm_params['max_depth'],
+        'min_data_in_leaf': pygbm_params['min_samples_leaf'],
+        'lambda_l2': pygbm_params['l2_regularization'],
+        'max_bin': pygbm_params['max_bins'],
+        'min_data_in_bin': 1,
+        'min_sum_hessian_in_leaf': 1e-3,
+        'min_gain_to_split': 0,
+        'verbosity': 10 if pygbm_params['verbose'] else 0
+    }
+    # TODO: change the hardcoded values when / if they become arguments to
+    # the estimator.
+
+    if pygbm_params['loss'] == 'categorical_crossentropy':
+        # LightGBM multiplies hessians by 2 in the multiclass loss, so we
+        # double the learning rate and the hessian threshold to compensate.
+        lgbm_params['min_sum_hessian_in_leaf'] *= 2
+        lgbm_params['learning_rate'] *= 2
+
+    if isinstance(pygbm_estimator, GradientBoostingClassifier):
+        Est = LGBMClassifier
+    else:
+        Est = LGBMRegressor
+
+    return Est(**lgbm_params)
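
A quick sketch (not from the commit) of the multiclass adjustment above: converting a `categorical_crossentropy` estimator should yield a LightGBM estimator with a doubled learning rate.

    # Sketch: the converter compensates for LightGBM's 2x multiclass hessians.
    from pygbm import GradientBoostingClassifier
    from pygbm.utils import get_lightgbm_estimator

    clf = GradientBoostingClassifier(loss='categorical_crossentropy',
                                     learning_rate=0.5, scoring=None)
    lgbm = get_lightgbm_estimator(clf)
    assert lgbm.learning_rate == 1.0  # doubled from 0.5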
39 changes: 8 additions & 31 deletions tests/test_compare_lightgbm.py
@@ -6,6 +6,10 @@
 
 from pygbm import GradientBoostingRegressor, GradientBoostingClassifier
 from pygbm.binning import BinMapper
+from pygbm.utils import get_lightgbm_estimator
+
+
+pytest.importorskip("lightgbm")
 
 
 @pytest.mark.parametrize('seed', range(5))
@@ -32,8 +36,6 @@ def test_same_predictions_regression(seed, min_samples_leaf, n_samples,
     # - To ignore discrepancies caused by small differences in the binning
     #   strategy, data is pre-binned if n_samples > 255.
 
-    lb = pytest.importorskip("lightgbm")
-
     rng = np.random.RandomState(seed=seed)
     n_samples = n_samples
     max_iter = 1
@@ -47,18 +49,13 @@
 
     X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
 
-    est_lightgbm = lb.LGBMRegressor(n_estimators=max_iter,
-                                    min_data_in_bin=1,
-                                    max_bin=max_bins,
-                                    learning_rate=1,
-                                    min_data_in_leaf=min_samples_leaf,
-                                    num_leaves=max_leaf_nodes)
     est_pygbm = GradientBoostingRegressor(max_iter=max_iter,
                                           max_bins=max_bins,
                                           learning_rate=1,
                                           validation_split=None, scoring=None,
                                           min_samples_leaf=min_samples_leaf,
                                           max_leaf_nodes=max_leaf_nodes)
+    est_lightgbm = get_lightgbm_estimator(est_pygbm)
 
     est_lightgbm.fit(X_train, y_train)
     est_pygbm.fit(X_train, y_train)
@@ -85,8 +82,6 @@ def test_same_predictions_classification(seed, min_samples_leaf, n_samples,
                                          max_leaf_nodes):
     # Same as test_same_predictions_regression but for classification
 
-    lb = pytest.importorskip("lightgbm")
-
     rng = np.random.RandomState(seed=seed)
     n_samples = n_samples
     max_iter = 1
@@ -100,13 +95,6 @@
 
     X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
 
-    est_lightgbm = lb.LGBMClassifier(objective='binary',
-                                     n_estimators=max_iter,
-                                     min_data_in_bin=1,
-                                     max_bin=max_bins,
-                                     learning_rate=1,
-                                     min_data_in_leaf=min_samples_leaf,
-                                     num_leaves=max_leaf_nodes)
     est_pygbm = GradientBoostingClassifier(loss='binary_crossentropy',
                                            max_iter=max_iter,
                                            max_bins=max_bins,
@@ -115,6 +103,7 @@ def test_same_predictions_classification(seed, min_samples_leaf, n_samples,
                                            scoring=None,
                                            min_samples_leaf=min_samples_leaf,
                                            max_leaf_nodes=max_leaf_nodes)
+    est_lightgbm = get_lightgbm_estimator(est_pygbm)
 
     est_lightgbm.fit(X_train, y_train)
     est_pygbm.fit(X_train, y_train)
@@ -148,12 +137,11 @@ def test_same_predictions_multiclass_classification(
         seed, min_samples_leaf, n_samples, max_leaf_nodes):
     # Same as test_same_predictions_regression but for classification
 
-    lb = pytest.importorskip("lightgbm")
-
     rng = np.random.RandomState(seed=seed)
     n_samples = n_samples
     max_iter = 1
     max_bins = 256
+    lr = 1
 
     X, y = make_classification(n_samples=n_samples, n_classes=3, n_features=5,
                                n_informative=5, n_redundant=0,
@@ -164,18 +152,6 @@
 
     X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
 
-    # LightGBM multiplies the hessians by 2 so we need to double the learning
-    # rate. We could also do that for min_hessian_to_split.
-    lr = 1
-    lr_lightgbm = lr * 2
-
-    est_lightgbm = lb.LGBMClassifier(objective='multiclass',
-                                     n_estimators=max_iter,
-                                     min_data_in_bin=1,
-                                     max_bin=max_bins,
-                                     learning_rate=lr_lightgbm,
-                                     min_data_in_leaf=min_samples_leaf,
-                                     num_leaves=max_leaf_nodes)
     est_pygbm = GradientBoostingClassifier(loss='categorical_crossentropy',
                                            max_iter=max_iter,
                                            max_bins=max_bins,
@@ -184,6 +160,7 @@
                                            scoring=None,
                                            min_samples_leaf=min_samples_leaf,
                                            max_leaf_nodes=max_leaf_nodes)
+    est_lightgbm = get_lightgbm_estimator(est_pygbm)
 
     est_lightgbm.fit(X_train, y_train)
     est_pygbm.fit(X_train, y_train)
