Skip to content

Commit

Permalink
[dask][python-package] include support for column array as label (#3943)
Browse files Browse the repository at this point in the history
* include support for column array as label

* remove nested ifs

* fix linting errors

* include tests for sklearn regressors

* include docstring for numpy_1d_array_to_dtype

* include . at end of docstring

* remove pandas import and test for regression, classification and ranking

* check predictions of sklearn models as well

* test training only in dask. drop pandas series tests

* use PANDAS_INSTALLED and pd_Series

* inline imports

* use col array in fit for test_dask

* include review comments
  • Loading branch information
jmoralez committed Feb 24, 2021
1 parent 86a085f commit 5dacd60
Show file tree
Hide file tree
Showing 4 changed files with 122 additions and 5 deletions.
24 changes: 20 additions & 4 deletions python-package/lightgbm/basic.py
Expand Up @@ -126,6 +126,21 @@ def is_numpy_1d_array(data):
return isinstance(data, np.ndarray) and len(data.shape) == 1


def is_numpy_column_array(data):
"""Check whether data is a column numpy array."""
if not isinstance(data, np.ndarray):
return False
shape = data.shape
return len(shape) == 2 and shape[1] == 1


def cast_numpy_1d_array_to_dtype(array, dtype):
"""Cast numpy 1d array to given dtype."""
if array.dtype == dtype:
return array
return array.astype(dtype=dtype, copy=False)


def is_1d_list(data):
"""Check whether data is a 1-D list."""
return isinstance(data, list) and (not data or is_numeric(data[0]))
Expand All @@ -134,10 +149,11 @@ def is_1d_list(data):
def list_to_1d_numpy(data, dtype=np.float32, name='list'):
"""Convert data to numpy 1-D array."""
if is_numpy_1d_array(data):
if data.dtype == dtype:
return data
else:
return data.astype(dtype=dtype, copy=False)
return cast_numpy_1d_array_to_dtype(data, dtype)
elif is_numpy_column_array(data):
_log_warning('Converting column-vector to 1d array')
array = data.ravel()
return cast_numpy_1d_array_to_dtype(array, dtype)
elif is_1d_list(data):
return np.array(data, dtype=dtype, copy=False)
elif isinstance(data, pd_Series):
Expand Down
31 changes: 31 additions & 0 deletions tests/python_package_test/test_basic.py
Expand Up @@ -8,6 +8,7 @@
from sklearn.model_selection import train_test_split

import lightgbm as lgb
from lightgbm.compat import PANDAS_INSTALLED, pd_Series

from .utils import load_breast_cancer

Expand Down Expand Up @@ -375,3 +376,33 @@ def test_choose_param_value():
"num_trees": 81
}
assert original_params == expected_params


@pytest.mark.skipif(not PANDAS_INSTALLED, reason='pandas is not installed')
@pytest.mark.parametrize(
'y',
[
np.random.rand(10),
np.random.rand(10, 1),
pd_Series(np.random.rand(10)),
pd_Series(['a', 'b']),
[1] * 10,
[[1], [2]]
])
@pytest.mark.parametrize('dtype', [np.float32, np.float64])
def test_list_to_1d_numpy(y, dtype):
if isinstance(y, np.ndarray) and len(y.shape) == 2:
with pytest.warns(UserWarning, match='column-vector'):
lgb.basic.list_to_1d_numpy(y)
return
elif isinstance(y, list) and isinstance(y[0], list):
with pytest.raises(TypeError):
lgb.basic.list_to_1d_numpy(y)
return
elif isinstance(y, pd_Series) and y.dtype == object:
with pytest.raises(ValueError):
lgb.basic.list_to_1d_numpy(y)
return
result = lgb.basic.list_to_1d_numpy(y, dtype=dtype)
assert result.size == 10
assert result.dtype == dtype
37 changes: 37 additions & 0 deletions tests/python_package_test/test_dask.py
Expand Up @@ -1198,6 +1198,43 @@ def test_dask_methods_and_sklearn_equivalents_have_similar_signatures(methods):
assert dask_params[param].default == sklearn_params[param].default, error_msg


@pytest.mark.parametrize('task', tasks)
def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(
task,
client,
):
if task == 'ranking':
_, _, _, _, dX, dy, dw, dg = _create_ranking_data(
output='dataframe',
group=None
)
model_factory = lgb.DaskLGBMRanker
else:
_, _, _, dX, dy, dw = _create_data(
objective=task,
output='dataframe',
)
dg = None
if task == 'classification':
model_factory = lgb.DaskLGBMClassifier
elif task == 'regression':
model_factory = lgb.DaskLGBMRegressor
dy = dy.to_dask_array(lengths=True)
dy_col_array = dy.reshape(-1, 1)
assert len(dy_col_array.shape) == 2 and dy_col_array.shape[1] == 1

params = {
'n_estimators': 1,
'num_leaves': 3,
'random_state': 0,
'time_out': 5
}
model = model_factory(**params)
model.fit(dX, dy_col_array, sample_weight=dw, group=dg)
assert model.fitted_
client.close(timeout=CLIENT_CLOSE_TIMEOUT)


def sklearn_checks_to_run():
check_names = [
"check_estimator_get_tags_default_keys",
Expand Down
35 changes: 34 additions & 1 deletion tests/python_package_test/test_sklearn.py
Expand Up @@ -18,7 +18,7 @@

import lightgbm as lgb

from .utils import load_boston, load_breast_cancer, load_digits, load_iris, load_linnerud
from .utils import load_boston, load_breast_cancer, load_digits, load_iris, load_linnerud, make_ranking

sk_version = parse_version(sk_version)
if sk_version < parse_version("0.23"):
Expand Down Expand Up @@ -1192,3 +1192,36 @@ def test_parameters_default_constructible(estimator):
name, Estimator = estimator.__class__.__name__, estimator.__class__
# Test that estimators are default-constructible
check_parameters_default_constructible(name, Estimator)


@pytest.mark.parametrize('task', ['classification', 'ranking', 'regression'])
def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(task):
pd = pytest.importorskip("pandas")
if task == 'ranking':
X, y, g = make_ranking()
g = np.bincount(g)
model_factory = lgb.LGBMRanker
elif task == 'classification':
X, y = load_iris(return_X_y=True)
model_factory = lgb.LGBMClassifier
elif task == 'regression':
X, y = load_boston(return_X_y=True)
model_factory = lgb.LGBMRegressor
X = pd.DataFrame(X)
y_col_array = y.reshape(-1, 1)
params = {
'n_estimators': 1,
'num_leaves': 3,
'random_state': 0
}
with pytest.warns(UserWarning, match='column-vector'):
if task == 'ranking':
model_1d = model_factory(**params).fit(X, y, group=g)
model_2d = model_factory(**params).fit(X, y_col_array, group=g)
else:
model_1d = model_factory(**params).fit(X, y)
model_2d = model_factory(**params).fit(X, y_col_array)

preds_1d = model_1d.predict(X)
preds_2d = model_2d.predict(X)
np.testing.assert_array_equal(preds_1d, preds_2d)

0 comments on commit 5dacd60

Please sign in to comment.