Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
…#1382) * Note the relationship between `binary` and `xentropy` in the docs and provide an example that compares them * Pass pylint and fix docs formatting. "pylint: disable = no-name-in-module" because pylint does not believe scipy.special contains expit * pass pycodestyle * pass pycodestyle with correct flags; document example in readme
- Loading branch information
Showing
3 changed files
with
114 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
# pylint: disable = no-name-in-module | ||
''' | ||
BLUF: The `xentropy` objective does logistic regression and generalizes | ||
to the case where labels are probabilistic (i.e. numbers between 0 and 1). | ||
Details: Both `binary` and `xentropy` minimize the log loss and use | ||
`boost_from_average = TRUE` by default. Possibly the only difference | ||
between them with default settings is that `binary` may achieve a slight | ||
speed improvement by assuming that the labels are binary instead of | ||
probabilistic. | ||
''' | ||
|
||
import time | ||
|
||
import lightgbm as lgb | ||
import numpy as np | ||
import pandas as pd | ||
from scipy.special import expit | ||
|
||
#################
# Simulate some binary data with a single categorical and
# single continuous predictor
np.random.seed(0)
N = 1000
X = pd.DataFrame({
    'continuous': range(N),
    # N // 5 (integer division): np.repeat requires an integer repeat
    # count; the float produced by N / 5 raises TypeError on modern NumPy.
    'categorical': np.repeat([0, 1, 2, 3, 4], N // 5)
})
CATEGORICAL_EFFECTS = [-1, -1, -2, -2, 2]
# Linear predictor: -0.5 + 0.01 * continuous + per-category effect,
# vectorized (same elementwise operation order as the original loop,
# so the resulting floats are identical), plus standard-normal noise.
LINEAR_TERM = (
    -0.5
    + 0.01 * X['continuous'].to_numpy()
    + np.asarray(CATEGORICAL_EFFECTS)[X['categorical'].to_numpy()]
) + np.random.normal(0, 1, X.shape[0])
TRUE_PROB = expit(LINEAR_TERM)  # squash linear term to (0, 1)
Y = np.random.binomial(1, TRUE_PROB, size=N)  # binary labels from TRUE_PROB
# One shared dict so experiments can select labels/Datasets by name.
DATA = {
    'X': X,
    'probability_labels': TRUE_PROB,
    'binary_labels': Y,
    'lgb_with_binary_labels': lgb.Dataset(X, Y),
    'lgb_with_probability_labels': lgb.Dataset(X, TRUE_PROB),
}
|
||
|
||
################# | ||
# Set up a couple of utilities for our experiments | ||
def log_loss(preds, labels):
    '''
    Logarithmic (cross-entropy) loss with non-necessarily-binary labels.

    :param preds: array-like of predicted probabilities, each in (0, 1)
    :param labels: array-like of labels, each in [0, 1] (binary or
        probabilistic)
    :return: (float) mean negative Bernoulli log-likelihood
    '''
    preds = np.asarray(preds, dtype=float)
    labels = np.asarray(labels, dtype=float)
    # Both terms of the Bernoulli log-likelihood are needed; the original
    # omitted (1 - labels) * log(1 - preds), which is not the log loss
    # (e.g. preds = labels = 0.5 gave 0.347 instead of log(2) = 0.693).
    log_likelihood = np.sum(
        labels * np.log(preds) + (1 - labels) * np.log(1 - preds)
    ) / len(preds)
    return -log_likelihood
|
||
|
||
def experiment(objective, label_type, data):
    '''
    Measure performance of an objective.

    :param objective: (str) 'binary' or 'xentropy'
    :param label_type: (str) 'binary' or 'probability'
    :param data: DATA-style dict holding features, labels and lgb Datasets
    :return: dict with experiment summary stats (time, correlation, logloss)
    '''
    # Fixed seed so repeated runs of the same configuration are comparable.
    np.random.seed(0)
    train_set = data['lgb_with_' + label_type + '_labels']
    booster_params = {
        'objective': objective,
        'feature_fraction': 1,
        'bagging_fraction': 1,
        'verbose': -1
    }
    started = time.time()
    booster = lgb.train(booster_params, train_set, num_boost_round=5)
    fitted = booster.predict(data['X'])
    observed = data[label_type + '_labels']
    elapsed = time.time() - started  # wall time for train + predict
    return {
        'time': elapsed,
        'correlation': np.corrcoef(fitted, observed)[0, 1],
        'logloss': log_loss(fitted, observed)
    }
|
||
|
||
#################
# Observe the behavior of `binary` and `xentropy` objectives
for run_objective, run_labels in (('binary', 'binary'),
                                  ('xentropy', 'binary'),
                                  ('xentropy', 'probability')):
    print('Performance of `' + run_objective + '` objective with '
          + run_labels + ' labels:')
    print(experiment(run_objective, label_type=run_labels, data=DATA))

# Trying this throws an error on non-binary values of y:
# experiment('binary', label_type='probability', DATA)

# The speed of `binary` is not drastically different from `xentropy`;
# `xentropy` runs faster than `binary` in many cases, although there are
# reasons to suspect that `binary` should run faster when the label is an
# integer instead of a float.
K = 10
A = []
B = []
for _ in range(K):
    A.append(experiment('binary', label_type='binary', data=DATA)['time'])
for _ in range(K):
    B.append(experiment('xentropy', label_type='binary', data=DATA)['time'])
print('Best `binary` time: ' + str(min(A)))
print('Best `xentropy` time: ' + str(min(B)))