[MRG] Added pygbm-to-lightgbm model conversion (#61)
Created `utils.py` with a utility that builds a LightGBM estimator with the same hyperparameters as a pygbm estimator (see the usage sketch below).
NicolasHug committed Dec 10, 2018
1 parent a52735d commit 974f8af
Showing 3 changed files with 77 additions and 47 deletions.
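A minimal usage sketch of the new helper, not part of the commit itself; the parameter values are illustrative, but the constraints (an explicit `loss` and `scoring=None`) follow from the checks in `utils.py` below:

    # Hypothetical usage of pygbm.utils.get_lightgbm_estimator.
    from pygbm import GradientBoostingClassifier
    from pygbm.utils import get_lightgbm_estimator

    # 'loss' must be explicit ('auto' is rejected) and scoring must be None,
    # since early stopping has no translation in the converter.
    pygbm_clf = GradientBoostingClassifier(loss='binary_crossentropy',
                                           learning_rate=0.1, max_iter=100,
                                           scoring=None)
    lightgbm_clf = get_lightgbm_estimator(pygbm_clf)  # unfitted LGBMClassifier
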
30 changes: 14 additions & 16 deletions benchmarks/bench_higgs_boson.py
@@ -10,6 +10,7 @@
 from sklearn.metrics import accuracy_score, roc_auc_score
 from joblib import Memory
 from pygbm import GradientBoostingClassifier
+from pygbm.utils import get_lightgbm_estimator
 from lightgbm import LGBMClassifier
 import numba

@@ -64,21 +65,6 @@ def load_data():
 n_samples, n_features = data_train.shape
 print(f"Training set with {n_samples} records with {n_features} features.")
 
-if not args.no_lightgbm:
-    print("Fitting a LightGBM model...")
-    tic = time()
-    lightgbm_model = LGBMClassifier(objective='binary',
-                                    n_estimators=n_trees,
-                                    num_leaves=n_leaf_nodes,
-                                    learning_rate=lr, verbose=10,
-                                    min_data_in_bin=1)
-    lightgbm_model.fit(data_train, target_train)
-    toc = time()
-    predicted_test = lightgbm_model.predict(data_test)
-    roc_auc = roc_auc_score(target_test, predicted_test)
-    acc = accuracy_score(target_test, predicted_test)
-    print(f"done in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc :.4f}")
-
 print("JIT compiling code for the pygbm model...")
 tic = time()
 pygbm_model = GradientBoostingClassifier(learning_rate=lr, max_iter=1,
@@ -93,7 +79,8 @@ def load_data():
 
 print("Fitting a pygbm model...")
 tic = time()
-pygbm_model = GradientBoostingClassifier(learning_rate=lr, max_iter=n_trees,
+pygbm_model = GradientBoostingClassifier(loss='binary_crossentropy',
+                                         learning_rate=lr, max_iter=n_trees,
                                          max_bins=max_bins,
                                          max_leaf_nodes=n_leaf_nodes,
                                          random_state=0, scoring=None,
@@ -107,3 +94,14 @@ def load_data():
 
 if hasattr(numba, 'threading_layer'):
     print("Threading layer chosen: %s" % numba.threading_layer())
+
+if not args.no_lightgbm:
+    print("Fitting a LightGBM model...")
+    tic = time()
+    lightgbm_model = get_lightgbm_estimator(pygbm_model)
+    lightgbm_model.fit(data_train, target_train)
+    toc = time()
+    predicted_test = lightgbm_model.predict(data_test)
+    roc_auc = roc_auc_score(target_test, predicted_test)
+    acc = accuracy_score(target_test, predicted_test)
+    print(f"done in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc :.4f}")
55 changes: 55 additions & 0 deletions pygbm/utils.py
@@ -0,0 +1,55 @@
+from lightgbm import LGBMRegressor
+from lightgbm import LGBMClassifier
+
+from .gradient_boosting import GradientBoostingClassifier
+
+
+def get_lightgbm_estimator(pygbm_estimator):
+    """Return an unfitted LightGBM estimator with matching hyperparameters.
+    This utility takes care of renaming the pygbm parameters to their
+    LightGBM equivalents.
+    """
+
+    pygbm_params = pygbm_estimator.get_params()
+
+    if pygbm_params['loss'] == 'auto':
+        raise ValueError('auto loss is not accepted. We need to know if '
+                         'the problem is binary or multiclass classification.')
+    if pygbm_params['scoring'] is not None:
+        raise NotImplementedError('Early stopping should be deactivated.')
+
+    loss_mapping = {
+        'least_squares': 'regression_l2',
+        'binary_crossentropy': 'binary',
+        'categorical_crossentropy': 'multiclass'
+    }
+
+    lgbm_params = {
+        'objective': loss_mapping[pygbm_params['loss']],
+        'learning_rate': pygbm_params['learning_rate'],
+        'n_estimators': pygbm_params['max_iter'],
+        'num_leaves': pygbm_params['max_leaf_nodes'],
+        'max_depth': pygbm_params['max_depth'],
+        'min_data_in_leaf': pygbm_params['min_samples_leaf'],
+        'lambda_l2': pygbm_params['l2_regularization'],
+        'max_bin': pygbm_params['max_bins'],
+        'min_data_in_bin': 1,
+        'min_sum_hessian_in_leaf': 1e-3,
+        'min_gain_to_split': 0,
+        'verbosity': 10 if pygbm_params['verbose'] else 0
+    }
+    # TODO: change the hardcoded values when / if they become arguments to
+    # the estimator.
+
+    if pygbm_params['loss'] == 'categorical_crossentropy':
+        # LightGBM multiplies hessians by 2 in the multiclass loss, so we
+        # double the learning rate and the hessian threshold to compensate.
+        lgbm_params['min_sum_hessian_in_leaf'] *= 2
+        lgbm_params['learning_rate'] *= 2
+
+    if isinstance(pygbm_estimator, GradientBoostingClassifier):
+        Est = LGBMClassifier
+    else:
+        Est = LGBMRegressor
+
+    return Est(**lgbm_params)
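
A quick sketch (not from the commit) of the multiclass adjustment above: converting a `categorical_crossentropy` estimator should yield a LightGBM estimator with a doubled learning rate.

    # Sketch: the converter compensates for LightGBM's 2x multiclass hessians.
    from pygbm import GradientBoostingClassifier
    from pygbm.utils import get_lightgbm_estimator

    clf = GradientBoostingClassifier(loss='categorical_crossentropy',
                                     learning_rate=0.5, scoring=None)
    lgbm = get_lightgbm_estimator(clf)
    assert lgbm.learning_rate == 1.0  # doubled from 0.5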
39 changes: 8 additions & 31 deletions tests/test_compare_lightgbm.py
@@ -6,6 +6,10 @@
 
 from pygbm import GradientBoostingRegressor, GradientBoostingClassifier
 from pygbm.binning import BinMapper
+from pygbm.utils import get_lightgbm_estimator
+
+
+pytest.importorskip("lightgbm")
 
 
 @pytest.mark.parametrize('seed', range(5))
@@ -32,8 +36,6 @@ def test_same_predictions_regression(seed, min_samples_leaf, n_samples,
     # - To ignore discrepancies caused by small differences in the binning
     #   strategy, data is pre-binned if n_samples > 255.
 
-    lb = pytest.importorskip("lightgbm")
-
     rng = np.random.RandomState(seed=seed)
     n_samples = n_samples
     max_iter = 1
@@ -47,18 +49,13 @@
 
     X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
 
-    est_lightgbm = lb.LGBMRegressor(n_estimators=max_iter,
-                                    min_data_in_bin=1,
-                                    max_bin=max_bins,
-                                    learning_rate=1,
-                                    min_data_in_leaf=min_samples_leaf,
-                                    num_leaves=max_leaf_nodes)
     est_pygbm = GradientBoostingRegressor(max_iter=max_iter,
                                           max_bins=max_bins,
                                           learning_rate=1,
                                           validation_split=None, scoring=None,
                                           min_samples_leaf=min_samples_leaf,
                                           max_leaf_nodes=max_leaf_nodes)
+    est_lightgbm = get_lightgbm_estimator(est_pygbm)
 
     est_lightgbm.fit(X_train, y_train)
     est_pygbm.fit(X_train, y_train)
@@ -85,8 +82,6 @@ def test_same_predictions_classification(seed, min_samples_leaf, n_samples,
                                          max_leaf_nodes):
     # Same as test_same_predictions_regression but for classification
 
-    lb = pytest.importorskip("lightgbm")
-
     rng = np.random.RandomState(seed=seed)
     n_samples = n_samples
     max_iter = 1
@@ -100,13 +95,6 @@
 
     X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
 
-    est_lightgbm = lb.LGBMClassifier(objective='binary',
-                                     n_estimators=max_iter,
-                                     min_data_in_bin=1,
-                                     max_bin=max_bins,
-                                     learning_rate=1,
-                                     min_data_in_leaf=min_samples_leaf,
-                                     num_leaves=max_leaf_nodes)
     est_pygbm = GradientBoostingClassifier(loss='binary_crossentropy',
                                            max_iter=max_iter,
                                            max_bins=max_bins,
@@ -115,6 +103,7 @@ def test_same_predictions_classification(seed, min_samples_leaf, n_samples,
                                            scoring=None,
                                            min_samples_leaf=min_samples_leaf,
                                            max_leaf_nodes=max_leaf_nodes)
+    est_lightgbm = get_lightgbm_estimator(est_pygbm)
 
     est_lightgbm.fit(X_train, y_train)
     est_pygbm.fit(X_train, y_train)
@@ -148,12 +137,11 @@ def test_same_predictions_multiclass_classification(
         seed, min_samples_leaf, n_samples, max_leaf_nodes):
     # Same as test_same_predictions_regression but for classification
 
-    lb = pytest.importorskip("lightgbm")
-
     rng = np.random.RandomState(seed=seed)
     n_samples = n_samples
     max_iter = 1
     max_bins = 256
+    lr = 1
 
     X, y = make_classification(n_samples=n_samples, n_classes=3, n_features=5,
                                n_informative=5, n_redundant=0,
@@ -164,18 +152,6 @@
 
     X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
 
-    # LightGBM multiplies the hessians by 2 so we need to double the learning
-    # rate. We could also do that for min_hessian_to_split.
-    lr = 1
-    lr_lightgbm = lr * 2
-
-    est_lightgbm = lb.LGBMClassifier(objective='multiclass',
-                                     n_estimators=max_iter,
-                                     min_data_in_bin=1,
-                                     max_bin=max_bins,
-                                     learning_rate=lr_lightgbm,
-                                     min_data_in_leaf=min_samples_leaf,
-                                     num_leaves=max_leaf_nodes)
     est_pygbm = GradientBoostingClassifier(loss='categorical_crossentropy',
                                            max_iter=max_iter,
                                            max_bins=max_bins,
@@ -184,6 +160,7 @@
                                            scoring=None,
                                            min_samples_leaf=min_samples_leaf,
                                            max_leaf_nodes=max_leaf_nodes)
+    est_lightgbm = get_lightgbm_estimator(est_pygbm)
 
     est_lightgbm.fit(X_train, y_train)
     est_pygbm.fit(X_train, y_train)
