From bdd30e43d1b338f55cd6189b00e6b6ec8f632ee2 Mon Sep 17 00:00:00 2001
From: Zach Kurtz
Date: Mon, 21 May 2018 23:07:22 -0400
Subject: [PATCH] [docs][examples] Clarify relationship between `xentropy` and `binary` (#1382)

* Note the relationship between `binary` and `xentropy` in the docs and provide an example that compares them
* Pass pylint and fix docs formatting. "pylint: disable = no-name-in-module" because pylint does not believe scipy.special contains expit
* pass pycodestyle
* pass pycodestyle with correct flags; document example in readme
---
 docs/Parameters.rst                          |   2 +-
 examples/python-guide/README.md              |   9 +-
 examples/python-guide/logistic_regression.py | 106 +++++++++++++++++++
 3 files changed, 114 insertions(+), 3 deletions(-)
 create mode 100644 examples/python-guide/logistic_regression.py

diff --git a/docs/Parameters.rst b/docs/Parameters.rst
index be7603769fa7..e02509f27839 100644
--- a/docs/Parameters.rst
+++ b/docs/Parameters.rst
@@ -69,7 +69,7 @@ Core Parameters
 
    - ``tweedie``, Tweedie regression with log-link. It might be useful, e.g., for modeling total loss in insurance, or for any target that might be `tweedie-distributed`_
 
-   - ``binary``, binary `log loss`_ classification application
+   - ``binary``, binary `log loss`_ classification (or logistic regression). Requires labels in {0, 1}; see ``xentropy`` for general probability labels in [0, 1]
 
    - multi-class classification application
 
diff --git a/examples/python-guide/README.md b/examples/python-guide/README.md
index eefb9bfed105..1fa01a59434a 100644
--- a/examples/python-guide/README.md
+++ b/examples/python-guide/README.md
@@ -5,10 +5,10 @@ Here is an example for LightGBM to use Python-package.
 
 You should install LightGBM [Python-package](https://github.com/Microsoft/LightGBM/tree/master/python-package) first.
 
-You also need scikit-learn, pandas and matplotlib (only for plot example) to run the examples, but they are not required for the package itself. You can install them with pip:
+You also need scikit-learn, pandas, matplotlib (only for plot example), and scipy (only for logistic regression example) to run the examples, but they are not required for the package itself. You can install them with pip:
 
 ```
-pip install scikit-learn pandas matplotlib -U
+pip install scikit-learn pandas matplotlib scipy -U
 ```
 
 Now you can run examples in this folder, for example:
@@ -41,3 +41,8 @@ Examples include:
     - Self-defined objective function
     - Self-defined eval metric
     - Callback function
+- [logistic_regression.py](https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/logistic_regression.py)
+    - Use objective `xentropy` or `binary`
+    - Use `xentropy` with binary labels or probability labels
+    - Use `binary` only with binary labels
+    - Compare speed of `xentropy` versus `binary`
\ No newline at end of file
diff --git a/examples/python-guide/logistic_regression.py b/examples/python-guide/logistic_regression.py
new file mode 100644
index 000000000000..79e365beafdd
--- /dev/null
+++ b/examples/python-guide/logistic_regression.py
@@ -0,0 +1,106 @@
+# pylint: disable = no-name-in-module
+'''
+BLUF: The `xentropy` objective does logistic regression and generalizes
+to the case where labels are probabilistic (i.e., numbers between 0 and 1).
+
+Details: Both `binary` and `xentropy` minimize the log loss and use
+`boost_from_average = true` by default. Possibly the only difference
+between them with default settings is that `binary` may achieve a slight
+speed improvement by assuming that the labels are binary instead of
+probabilistic.
+'''
+
+import time
+
+import lightgbm as lgb
+import numpy as np
+import pandas as pd
+from scipy.special import expit
+
+#################
+# Simulate some binary data with a single categorical and
+# single continuous predictor
+np.random.seed(0)
+N = 1000
+X = pd.DataFrame({
+    'continuous': range(N),
+    'categorical': np.repeat([0, 1, 2, 3, 4], N // 5)
+})
+CATEGORICAL_EFFECTS = [-1, -1, -2, -2, 2]
+LINEAR_TERM = np.array([
+    -0.5 + 0.01 * X['continuous'][k]
+    + CATEGORICAL_EFFECTS[X['categorical'][k]] for k in range(X.shape[0])
+]) + np.random.normal(0, 1, X.shape[0])
+TRUE_PROB = expit(LINEAR_TERM)
+Y = np.random.binomial(1, TRUE_PROB, size=N)
+DATA = {
+    'X': X,
+    'probability_labels': TRUE_PROB,
+    'binary_labels': Y,
+    'lgb_with_binary_labels': lgb.Dataset(X, Y),
+    'lgb_with_probability_labels': lgb.Dataset(X, TRUE_PROB),
+}
+
+
+#################
+# Set up a couple of utilities for our experiments
+def log_loss(preds, labels):
+    '''Logarithmic loss with labels that are not necessarily binary.'''
+    log_likelihood = np.sum(labels * np.log(preds) +
+                            (1 - labels) * np.log(1 - preds)) / len(preds)
+    return -log_likelihood
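+
+# A quick sanity check on log_loss, using tiny arrays chosen purely for
+# illustration: the same formula scores predictions against binary labels
+# or against probability labels, which is the sense in which `xentropy`
+# generalizes `binary`.
+assert np.isclose(log_loss(np.array([0.8, 0.2]), np.array([1, 0])), 0.22314, atol=1e-4)
+assert np.isclose(log_loss(np.array([0.8, 0.2]), np.array([0.7, 0.3])), 0.63903, atol=1e-4)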
+
+
+def experiment(objective, label_type, data):
+    '''
+    Measure the performance of an objective.
+    :param objective: (str) 'binary' or 'xentropy'
+    :param label_type: (str) 'binary' or 'probability'
+    :param data: (dict) data for the experiment, structured like DATA above
+    :return: dict with experiment summary stats
+    '''
+    np.random.seed(0)
+    nrounds = 5
+    lgb_data = data['lgb_with_' + label_type + '_labels']
+    params = {
+        'objective': objective,
+        'feature_fraction': 1,
+        'bagging_fraction': 1,
+        'verbose': -1
+    }
+    time_zero = time.time()
+    gbm = lgb.train(params, lgb_data, num_boost_round=nrounds)
+    y_fitted = gbm.predict(data['X'])
+    y_true = data[label_type + '_labels']
+    duration = time.time() - time_zero
+    return {
+        'time': duration,
+        'correlation': np.corrcoef(y_fitted, y_true)[0, 1],
+        'logloss': log_loss(y_fitted, y_true)
+    }
+
+
+#################
+# Observe the behavior of `binary` and `xentropy` objectives
+print('Performance of `binary` objective with binary labels:')
+print(experiment('binary', label_type='binary', data=DATA))
+
+print('Performance of `xentropy` objective with binary labels:')
+print(experiment('xentropy', label_type='binary', data=DATA))
+
+print('Performance of `xentropy` objective with probability labels:')
+print(experiment('xentropy', label_type='probability', data=DATA))
+
+# Trying this throws an error on non-binary values of y
+# (see the sketch at the end of this script):
+# experiment('binary', label_type='probability', data=DATA)
+
+# The speed of `binary` is not drastically different from that of
+# `xentropy`. `xentropy` runs faster than `binary` in many cases, although
+# there are reasons to suspect that `binary` should run faster when the
+# label is an integer instead of a float.
+K = 10
+A = [experiment('binary', label_type='binary', data=DATA)['time']
+     for k in range(K)]
+B = [experiment('xentropy', label_type='binary', data=DATA)['time']
+     for k in range(K)]
+print('Best `binary` time: ' + str(min(A)))
+print('Best `xentropy` time: ' + str(min(B)))
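+
+#################
+# A minimal sketch of the failure mode noted above: `binary` is expected
+# to reject the probability labels that `xentropy` accepts, since it
+# requires labels in {0, 1}. This assumes the failure surfaces as
+# lightgbm.basic.LightGBMError; the exact exception type and message may
+# vary across LightGBM versions.
+try:
+    experiment('binary', label_type='probability', data=DATA)
+except lgb.basic.LightGBMError as err:
+    print('`binary` with probability labels raised an error, as expected:')
+    print('    ' + str(err))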