Skip to content

Commit

Permalink
[python] added possibility to use sklearn splitter classes in cv function (#1685)
Browse files Browse the repository at this point in the history

* added sklearn splitter classes in cv function

* added tests
  • Loading branch information
StrikerRUS authored and guolinke committed Sep 22, 2018
1 parent eb131ea commit 577a03c
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 19 deletions.
20 changes: 16 additions & 4 deletions python-package/lightgbm/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,8 +261,17 @@ def _make_n_folds(full_data, folds, nfold, params, seed, fpreproc=None, stratifi
full_data = full_data.construct()
num_data = full_data.num_data()
if folds is not None:
if not hasattr(folds, '__iter__'):
raise AttributeError("folds should be a generator or iterator of (train_idx, test_idx)")
if not hasattr(folds, '__iter__') and not hasattr(folds, 'split'):
raise AttributeError("folds should be a generator or iterator of (train_idx, test_idx) tuples "
"or scikit-learn splitter object with split method")
if hasattr(folds, 'split'):
group_info = full_data.get_group()
if group_info is not None:
group_info = group_info.astype(int)
flatted_group = np.repeat(range_(len(group_info)), repeats=group_info)
else:
flatted_group = np.zeros(num_data, dtype=int)
folds = folds.split(X=np.zeros(num_data), y=full_data.get_label(), groups=flatted_group)
else:
if 'objective' in params and params['objective'] == 'lambdarank':
if not SKLEARN_INSTALLED:
Expand Down Expand Up @@ -332,8 +341,11 @@ def cv(params, train_set, num_boost_round=100,
Data to be trained on.
num_boost_round : int, optional (default=100)
Number of boosting iterations.
folds : a generator or iterator of (train_idx, test_idx) tuples or None, optional (default=None)
The train and test indices for the each fold.
folds : a generator or iterator of (train_idx, test_idx) tuples, scikit-learn splitter object or None, optional (default=None)
If generator or iterator, it should yield the train and test indices for each fold.
If object, it should be one of the scikit-learn splitter classes
(http://scikit-learn.org/stable/modules/classes.html#splitter-classes)
and have ``split`` method.
This argument has highest priority over other data split arguments.
nfold : int, optional (default=5)
Number of folds in CV.
Expand Down
48 changes: 33 additions & 15 deletions tests/python_package_test/test_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from sklearn.datasets import (load_boston, load_breast_cancer, load_digits,
load_iris, load_svmlight_file)
from sklearn.metrics import log_loss, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.model_selection import train_test_split, TimeSeriesSplit, GroupKFold
from scipy.sparse import csr_matrix

try:
Expand Down Expand Up @@ -438,29 +438,47 @@ def test_cv(self):
lgb_train = lgb.Dataset(X_train, y_train)
# shuffle = False, override metric in params
params_with_metric = {'metric': 'l2', 'verbose': -1}
lgb.cv(params_with_metric, lgb_train, num_boost_round=10, nfold=3, stratified=False, shuffle=False,
metrics='l1', verbose_eval=False)
cv_res = lgb.cv(params_with_metric, lgb_train, num_boost_round=10, nfold=3, stratified=False, shuffle=False,
metrics='l1', verbose_eval=False)
self.assertIn('l1-mean', cv_res)
self.assertNotIn('l2-mean', cv_res)
self.assertEqual(len(cv_res['l1-mean']), 10)
# shuffle = True, callbacks
lgb.cv(params, lgb_train, num_boost_round=10, nfold=3, stratified=False, shuffle=True,
metrics='l1', verbose_eval=False,
callbacks=[lgb.reset_parameter(learning_rate=lambda i: 0.1 - 0.001 * i)])
cv_res = lgb.cv(params, lgb_train, num_boost_round=10, nfold=3, stratified=False, shuffle=True,
metrics='l1', verbose_eval=False,
callbacks=[lgb.reset_parameter(learning_rate=lambda i: 0.1 - 0.001 * i)])
self.assertIn('l1-mean', cv_res)
self.assertEqual(len(cv_res['l1-mean']), 10)
# self defined folds
tss = TimeSeriesSplit(3)
folds = tss.split(X_train)
lgb.cv(params_with_metric, lgb_train, num_boost_round=10, folds=folds, stratified=False, verbose_eval=False)
cv_res_gen = lgb.cv(params, lgb_train, num_boost_round=10, folds=folds,
metrics='l2', verbose_eval=False)
cv_res_obj = lgb.cv(params, lgb_train, num_boost_round=10, folds=tss,
metrics='l2', verbose_eval=False)
np.testing.assert_almost_equal(cv_res_gen['l2-mean'], cv_res_obj['l2-mean'])
# lambdarank
X_train, y_train = load_svmlight_file(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/lambdarank/rank.train'))
q_train = np.loadtxt(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../examples/lambdarank/rank.train.query'))
X_train, y_train = load_svmlight_file(os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../../examples/lambdarank/rank.train'))
q_train = np.loadtxt(os.path.join(os.path.dirname(os.path.realpath(__file__)),
'../../examples/lambdarank/rank.train.query'))
params_lambdarank = {'objective': 'lambdarank', 'verbose': -1, 'eval_at': 3}
lgb_train = lgb.Dataset(X_train, y_train, group=q_train)
# ... with NDCG (default) metric
cv_res = lgb.cv(params_lambdarank, lgb_train, num_boost_round=10, nfold=3, stratified=False, verbose_eval=False)
self.assertEqual(len(cv_res), 2)
self.assertFalse(np.isnan(cv_res['ndcg@3-mean']).any())
cv_res_lambda = lgb.cv(params_lambdarank, lgb_train, num_boost_round=10, nfold=3,
verbose_eval=False)
self.assertEqual(len(cv_res_lambda), 2)
self.assertFalse(np.isnan(cv_res_lambda['ndcg@3-mean']).any())
# ... with l2 metric
cv_res = lgb.cv(params_lambdarank, lgb_train, num_boost_round=10, nfold=3, stratified=False, metrics='l2', verbose_eval=False)
self.assertEqual(len(cv_res), 2)
self.assertFalse(np.isnan(cv_res['l2-mean']).any())
cv_res_lambda = lgb.cv(params_lambdarank, lgb_train, num_boost_round=10, nfold=3,
metrics='l2', verbose_eval=False)
self.assertEqual(len(cv_res_lambda), 2)
self.assertFalse(np.isnan(cv_res_lambda['l2-mean']).any())
# self defined folds with lambdarank
cv_res_lambda_obj = lgb.cv(params_lambdarank, lgb_train, num_boost_round=10,
folds=GroupKFold(n_splits=3),
metrics='l2', verbose_eval=False)
np.testing.assert_almost_equal(cv_res_lambda['l2-mean'], cv_res_lambda_obj['l2-mean'])

def test_feature_name(self):
X, y = load_boston(True)
Expand Down

0 comments on commit 577a03c

Please sign in to comment.