[python-package] fix access to Dataset metadata in scikit-learn custom metrics and objectives #6108

Merged · 20 commits · Nov 7, 2023

49 changes: 32 additions & 17 deletions python-package/lightgbm/basic.py
@@ -368,31 +368,31 @@ def _data_to_2d_numpy(
"It should be list of lists, numpy 2-D array or pandas DataFrame")


def _cfloat32_array_to_numpy(cptr: "ctypes._Pointer", length: int) -> np.ndarray:
def _cfloat32_array_to_numpy(*, cptr: "ctypes._Pointer", length: int) -> np.ndarray:
jameslamb (Collaborator, Author), Sep 22, 2023:

This ensures that this function is only called with keyword arguments.

That syntax has been available since Python 3.0: https://peps.python.org/pep-3102/

Doing that eliminates the possibility of bugs caused by accidentally passing arguments in the wrong order, and (in my opinion) makes the code a bit easier to read.
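
As a minimal illustrative sketch (not part of the diff), the bare * rejects positional calls:

def _scale(*, values, factor):
    """Hypothetical helper: everything after '*' is keyword-only."""
    return [v * factor for v in values]

_scale(values=[1.0, 2.0], factor=3.0)   # OK
# _scale([1.0, 2.0], 3.0)               # TypeError: takes 0 positional arguments but 2 were given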

"""Convert a ctypes float pointer array to a numpy array."""
if isinstance(cptr, ctypes.POINTER(ctypes.c_float)):
return np.ctypeslib.as_array(cptr, shape=(length,)).copy()
else:
raise RuntimeError('Expected float pointer')


def _cfloat64_array_to_numpy(cptr: "ctypes._Pointer", length: int) -> np.ndarray:
def _cfloat64_array_to_numpy(*, cptr: "ctypes._Pointer", length: int) -> np.ndarray:
"""Convert a ctypes double pointer array to a numpy array."""
if isinstance(cptr, ctypes.POINTER(ctypes.c_double)):
return np.ctypeslib.as_array(cptr, shape=(length,)).copy()
else:
raise RuntimeError('Expected double pointer')


def _cint32_array_to_numpy(cptr: "ctypes._Pointer", length: int) -> np.ndarray:
def _cint32_array_to_numpy(*, cptr: "ctypes._Pointer", length: int) -> np.ndarray:
"""Convert a ctypes int pointer array to a numpy array."""
if isinstance(cptr, ctypes.POINTER(ctypes.c_int32)):
return np.ctypeslib.as_array(cptr, shape=(length,)).copy()
else:
raise RuntimeError('Expected int32 pointer')


def _cint64_array_to_numpy(cptr: "ctypes._Pointer", length: int) -> np.ndarray:
def _cint64_array_to_numpy(*, cptr: "ctypes._Pointer", length: int) -> np.ndarray:
"""Convert a ctypes int pointer array to a numpy array."""
if isinstance(cptr, ctypes.POINTER(ctypes.c_int64)):
return np.ctypeslib.as_array(cptr, shape=(length,)).copy()
@@ -1229,18 +1229,18 @@ def __create_sparse_native(
data_indices_len = out_shape[0]
indptr_len = out_shape[1]
if indptr_type == _C_API_DTYPE_INT32:
out_indptr = _cint32_array_to_numpy(out_ptr_indptr, indptr_len)
out_indptr = _cint32_array_to_numpy(cptr=out_ptr_indptr, length=indptr_len)
elif indptr_type == _C_API_DTYPE_INT64:
out_indptr = _cint64_array_to_numpy(out_ptr_indptr, indptr_len)
out_indptr = _cint64_array_to_numpy(cptr=out_ptr_indptr, length=indptr_len)
else:
raise TypeError("Expected int32 or int64 type for indptr")
if data_type == _C_API_DTYPE_FLOAT32:
out_data = _cfloat32_array_to_numpy(out_ptr_data, data_indices_len)
out_data = _cfloat32_array_to_numpy(cptr=out_ptr_data, length=data_indices_len)
elif data_type == _C_API_DTYPE_FLOAT64:
out_data = _cfloat64_array_to_numpy(out_ptr_data, data_indices_len)
out_data = _cfloat64_array_to_numpy(cptr=out_ptr_data, length=data_indices_len)
else:
raise TypeError("Expected float32 or float64 type for data")
out_indices = _cint32_array_to_numpy(out_ptr_indices, data_indices_len)
out_indices = _cint32_array_to_numpy(cptr=out_ptr_indices, length=data_indices_len)
# break up indptr based on number of rows (note more than one matrix in multiclass case)
per_class_indptr_shape = cs.indptr.shape[0]
# for CSC there is extra column added
@@ -2504,6 +2504,12 @@ def set_field(
def get_field(self, field_name: str) -> Optional[np.ndarray]:
"""Get property from the Dataset.

Can only be run on a constructed Dataset.

Unlike ``get_group()``, ``get_init_score()``, ``get_label()``, ``get_position()``, and ``get_weight()``,
this method ignores any raw data passed into ``lgb.Dataset()`` on the Python side, and will only read
data from the constructed C++ ``Dataset`` object.

Parameters
----------
field_name : str
@@ -2530,11 +2536,20 @@ def get_field(self, field_name: str) -> Optional[np.ndarray]:
if tmp_out_len.value == 0:
return None
if out_type.value == _C_API_DTYPE_INT32:
arr = _cint32_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_int32)), tmp_out_len.value)
arr = _cint32_array_to_numpy(
cptr=ctypes.cast(ret, ctypes.POINTER(ctypes.c_int32)),
length=tmp_out_len.value
)
elif out_type.value == _C_API_DTYPE_FLOAT32:
arr = _cfloat32_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_float)), tmp_out_len.value)
arr = _cfloat32_array_to_numpy(
cptr=ctypes.cast(ret, ctypes.POINTER(ctypes.c_float)),
length=tmp_out_len.value
)
elif out_type.value == _C_API_DTYPE_FLOAT64:
arr = _cfloat64_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_double)), tmp_out_len.value)
arr = _cfloat64_array_to_numpy(
cptr=ctypes.cast(ret, ctypes.POINTER(ctypes.c_double)),
length=tmp_out_len.value
)
else:
raise TypeError("Unknown type")
if field_name == 'init_score':
@@ -2834,7 +2849,7 @@ def get_feature_name(self) -> List[str]:
ptr_string_buffers))
return [string_buffers[i].value.decode('utf-8') for i in range(num_feature)]

def get_label(self) -> Optional[np.ndarray]:
def get_label(self) -> Optional[_LGBM_LabelType]:
jameslamb (Collaborator, Author):

These issues in the Dataset.get_{metadata}() methods were the direct cause of the mypy errors mentioned in the description.

Here's an example.

import lightgbm as lgb
import numpy as np

X = np.array([[1.0, 2.0], [3.0, 4.0]])
dtrain = lgb.Dataset(
    data=X,
    label=[1, 2],
    params={
        "min_data_in_bin": 1,
        "min_data_in_leaf": 1,
    },
)

# 'label' was passed in as a list, get_label() returns that list
type(dtrain.get_label())
# <class 'list'>

# after construction, this is pulled from the C++ side, and is a numpy array
dtrain.construct()
type(dtrain.get_label())
# <class 'numpy.ndarray'>

"""Get the label of the Dataset.

Returns
@@ -2846,7 +2861,7 @@ def get_label(self) -> Optional[np.ndarray]:
self.label = self.get_field('label')
return self.label

def get_weight(self) -> Optional[np.ndarray]:
def get_weight(self) -> Optional[_LGBM_WeightType]:
"""Get the weight of the Dataset.

Returns
@@ -2858,7 +2873,7 @@ def get_weight(self) -> Optional[np.ndarray]:
self.weight = self.get_field('weight')
return self.weight

def get_init_score(self) -> Optional[np.ndarray]:
def get_init_score(self) -> Optional[_LGBM_InitScoreType]:
"""Get the initial score of the Dataset.

Returns
@@ -2902,7 +2917,7 @@ def get_data(self) -> Optional[_LGBM_TrainDataType]:
"set free_raw_data=False when construct Dataset to avoid this.")
return self.data

def get_group(self) -> Optional[np.ndarray]:
def get_group(self) -> Optional[_LGBM_GroupType]:
"""Get the group of the Dataset.

Returns
@@ -2921,7 +2936,7 @@ def get_group(self) -> Optional[np.ndarray]:
self.group = np.diff(self.group)
return self.group

def get_position(self) -> Optional[np.ndarray]:
def get_position(self) -> Optional[_LGBM_PositionType]:
"""Get the position of the Dataset.

Returns
20 changes: 14 additions & 6 deletions python-package/lightgbm/sklearn.py
@@ -151,14 +151,18 @@ def __call__(self, preds: np.ndarray, dataset: Dataset) -> Tuple[np.ndarray, np.ndarray]:
The value of the second order derivative (Hessian) of the loss
with respect to the elements of preds for each sample point.
"""
labels = dataset.get_label()
labels = dataset.get_field("label")
jameslamb (Collaborator, Author):

After correcting the type hints in Dataset methods, mypy rightly started complaining about these parts of the sklearn interface, where custom objective functions and metric functions are called.

With many instances like this:

sklearn.py:157: error: Argument 1 has incompatible type "Union[List[float], List[int], ndarray[Any, Any], Any, None]"; expected "Optional[ndarray[Any, Any]]"  [arg-type]

That's because we say in documentation and type hints that sklearn custom objective functions and eval metric functions should only expect to be passed numpy arrays or None.

_LGBM_ScikitCustomObjectiveFunction = Union[
# f(labels, preds)
Callable[
[Optional[np.ndarray], np.ndarray],
Tuple[np.ndarray, np.ndarray]
],
# f(labels, preds, weights)
Callable[
[Optional[np.ndarray], np.ndarray, Optional[np.ndarray]],
Tuple[np.ndarray, np.ndarray]
],
# f(labels, preds, weights, group)
Callable[
[Optional[np.ndarray], np.ndarray, Optional[np.ndarray], Optional[np.ndarray]],
Tuple[np.ndarray, np.ndarray]
],
]

Expects a callable with following signatures:
``func(y_true, y_pred)``,
``func(y_true, y_pred, weight)``
or ``func(y_true, y_pred, weight, group)``
and returns (grad, hess):
y_true : numpy 1-D array of shape = [n_samples]
The target values.
y_pred : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
The predicted values.
Predicted values are returned before any transformation,
e.g. they are raw margin instead of probability of positive class for binary task.
weight : numpy 1-D array of shape = [n_samples]
The weight of samples. Weights should be non-negative.
group : numpy 1-D array

This PR proposes fixing that by switching from get_label() to get_field("label") (and similarly for all the other metadata). Custom metrics and eval functions in the sklearn interface should only ever be called on already-constructed Dataset objects, so that should be safe. It should also reduce the risk of some mistake accidentally leading to something other than numpy arrays being passed into sklearn custom objective and metric functions.

jmoralez (Collaborator):

Have you looked into the performance impact of this? This method is called on each iteration and we're currently just returning an attribute from the dataset but this would require making a call to C++ to get the field each time. I think it just returns the pointer so it may not be that expensive but it's something to consider given that we'd be making several calls to get all the fields that this method needs.
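
As a rough, hypothetical sketch of the difference (array size and repeat count are arbitrary):

import timeit

import lightgbm as lgb
import numpy as np

rng = np.random.default_rng(42)
dtrain = lgb.Dataset(rng.random((100_000, 5)), label=rng.random(100_000))
dtrain.construct()

# get_label() returns the numpy array cached on the Python object;
# get_field("label") crosses into C++ and builds a fresh array every call
t_cached = timeit.timeit(dtrain.get_label, number=1_000)
t_field = timeit.timeit(lambda: dtrain.get_field("label"), number=1_000)
print(f"get_label():        {t_cached:.4f}s")
print(f"get_field('label'): {t_field:.4f}s")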

jameslamb (Collaborator, Author):

No, I didn't consider that this would be a performance issue. You're right, it could be!

> I think it just returns the pointer

Nah, your instincts that this might be more expensive were good! This call to LGBM_DatasetGetField() ...

_safe_call(_LIB.LGBM_DatasetGetField(
self._handle,
_c_str(field_name),
ctypes.byref(tmp_out_len),
ctypes.byref(ret),
ctypes.byref(out_type)))

... is just populating a pointer ...

if (dataset->GetFloatField(field_name, out_len, reinterpret_cast<const float**>(out_ptr))) {

*out_ptr = metadata_.label();

...but then that gets materialized into a new numpy array (i.e. a full copy is made in memory):

if out_type.value == _C_API_DTYPE_INT32:
arr = _cint32_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_int32)), tmp_out_len.value)
elif out_type.value == _C_API_DTYPE_FLOAT32:
arr = _cfloat32_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_float)), tmp_out_len.value)
elif out_type.value == _C_API_DTYPE_FLOAT64:
arr = _cfloat64_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_double)), tmp_out_len.value)

return np.ctypeslib.as_array(cptr, shape=(length,)).copy()
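
A standalone toy sketch (plain ctypes buffer, not LightGBM's) of that view-vs-copy distinction:

import ctypes

import numpy as np

buf = (ctypes.c_double * 3)(1.0, 2.0, 3.0)
ptr = ctypes.cast(buf, ctypes.POINTER(ctypes.c_double))

view = np.ctypeslib.as_array(ptr, shape=(3,))  # zero-copy view over the C buffer
copy = view.copy()                             # new allocation, independent memory

buf[0] = 99.0
print(view[0])  # 99.0 -- the view sees the mutation
print(copy[0])  # 1.0  -- the copy does not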

So I think you're right, calling get_field() unconditionally might not be a good way to do this. That's just wasted effort relative to .get_label(), .get_weight(), etc., since on a constructed Dataset, .label, .weight, etc. should already contain numpy arrays regardless of what format the raw data was passed in.

Thanks to these calls in Dataset._lazy_init():

if label is not None:
self.set_label(label)
if self.get_label() is None:
raise ValueError("Label should not be None")
if weight is not None:
self.set_weight(weight)
if group is not None:
self.set_group(group)
if position is not None:
self.set_position(position)

So given all that... let me think about whether there's a better way to resolve these errors from mypy. I don't think they're just things we should ignore... I do feel we should add some stronger guarantees that custom objective and metric functions are only ever passed an already-constructed Dataset.

Thanks for bringing it up!

jameslamb (Collaborator, Author):

Alright @jmoralez I think I found a better way to do this. I just pushed 017e5e5, with the following changes:

  • switches back to using get_label() and get_weight() instead of get_field()
    • because those will return the data cached on the Dataset object without reconstructing a new numpy array on every iteration
  • keeps get_field("group") for group
    • that'll still introduce a performance penalty (reconstructing an array on every iteration if the objective function / eval metric takes 4 arguments, i.e. is for learning-to-rank)
    • but I think we need to do that... right now the docs say such functions should expect group to be a numpy array, but lightgbm is currently passing them a list
    • group : numpy 1-D array
  • fixes the mypy errors by putting get_label() and get_weight() calls inside functions that use assert isinstance(x, np.ndarray) for type narrowing (see the sketch after this list)
  • adds docs to Dataset.get_{group/init_score/weight/position/label} explaining that the return type will be different for a constructed Dataset
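
Roughly, that narrowing pattern looks like the following (the helper name here is illustrative, not necessarily what the commit uses):

import numpy as np
from lightgbm import Dataset

def _label_from_constructed_dataset(dataset: Dataset) -> np.ndarray:
    # on a constructed Dataset, get_label() always yields a numpy array,
    # so this assert narrows the Union return type for mypy
    label = dataset.get_label()
    assert isinstance(label, np.ndarray)
    return label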

@jmoralez whenever you have time, could you please take another look? Thanks again for bringing this up!

jmoralez (Collaborator):

Can't we do for group what we do for other fields like init_score and weight, where after we set the field we set the Dataset attribute to the result of get_field?

self.set_field('init_score', init_score)
self.init_score = self.get_field('init_score') # original values can be modified at cpp side

self.set_field('weight', weight)
self.weight = self.get_field('weight') # original values can be modified at cpp side

jameslamb (Collaborator, Author):

We could but that would be a user-facing breaking change. Look at the test cases in this PR... even after construction, Dataset.get_group() returns whatever was passed in (most commonly a list).

Do you think it's worth the breaking change to get that consistency?

jmoralez (Collaborator):

I think it is, it's weird having all of the arguments of a custom objective be arrays except for group. Although the main difference I think is not really the data structure but the format that they're in (group boundaries vs lengths). Given that the docstring says array and it's more consistent with the other fields I'd prefer we override it with get_field. On the other hand I haven't used ranking a lot so I'm not sure which format is more convenient for the objective function.
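
For reference, a small sketch of the two formats (sizes here are made up):

import numpy as np

# users pass per-group sizes; the C++ side stores cumulative boundaries
sizes = np.array([10, 15, 15])
boundaries = np.concatenate([[0], np.cumsum(sizes)])       # array([ 0, 10, 25, 40])
np.testing.assert_array_equal(np.diff(boundaries), sizes)  # np.diff recovers the sizes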

jameslamb (Collaborator, Author):

Ok I think your reasoning makes sense and that we should change it to an array.

I'm especially convinced since tonight I found that .group gets set to a numpy array the first time you call get_group() on a Dataset where the .group attribute is None, which can happen when loading from a binary file:

import lightgbm as lgb
import numpy as np

X = np.array([[1.0, 2.0], [3.0, 4.0]])

dtrain = lgb.Dataset(
    X,
    params={
        "min_data_in_bin": 1,
        "min_data_in_leaf": 1,
        "verbosity": -1
    },
    group=[1, 1],
    label=[1, 2],
)
dtrain.construct()

# get_group() returns a list of sizes
assert dtrain.get_group() == [1, 1]

# get_field() returns a numpy array of boundaries
np.testing.assert_array_equal(
    dtrain.get_field("group"),
    np.array([0, 1, 2])
)


# round-trip to and from binary file
dtrain.save_binary("dtrain.bin")
dtrain2 = lgb.Dataset(
    data="dtrain.bin",
    params={
        "min_data_in_bin": 1,
        "min_data_in_leaf": 1,
        "verbosity": -1
    }
)

# before construction, group is empty
assert dtrain2.group is None

# after construction, get_group() returns a numpy array of sizes
dtrain2.construct()
np.testing.assert_array_equal(
    dtrain2.get_group(),
    np.array([1, 1])
)

# ... and get_field() returns a numpy array of boundaries
np.testing.assert_array_equal(
    dtrain2.get_field("group"),
    np.array([0, 1, 2])
)

That doesn't matter specifically for the scikit-learn interface (which I don't believe could ever encounter a binary dataset file), but it does give me more confidence that other parts of the code base aren't implicitly relying on Dataset.group being a list.


I just did the following:

  • pushed ada18f8 with these changes
  • changed the label on this PR to breaking
  • changed the title of the PR
  • added a section in the description explaining why this is breaking

Thanks for talking through it with me, I know this is way down in the depths of the library and that reviewing it takes a lot of effort.

jameslamb (Collaborator, Author):

I won't merge this until you've had another chance to review, @jmoralez. Take your time!

argc = len(signature(self.func).parameters)
if argc == 2:
grad, hess = self.func(labels, preds) # type: ignore[call-arg]
elif argc == 3:
grad, hess = self.func(labels, preds, dataset.get_weight()) # type: ignore[call-arg]
grad, hess = self.func(labels, preds, dataset.get_field("weight")) # type: ignore[call-arg]
elif argc == 4:
grad, hess = self.func(labels, preds, dataset.get_weight(), dataset.get_group()) # type: ignore [call-arg]
group = dataset.get_field("group")
if group is not None:
return self.func(labels, preds, dataset.get_field("weight"), np.diff(group)) # type: ignore[call-arg]
else:
return self.func(labels, preds, dataset.get_field("weight"), group) # type: ignore[call-arg]
else:
raise TypeError(f"Self-defined objective function should have 2, 3 or 4 arguments, got {argc}")
return grad, hess
@@ -229,14 +233,18 @@ def __call__(
is_higher_better : bool
Is eval result higher better, e.g. AUC is ``is_higher_better``.
"""
labels = dataset.get_label()
labels = dataset.get_field("label")
argc = len(signature(self.func).parameters)
if argc == 2:
return self.func(labels, preds) # type: ignore[call-arg]
elif argc == 3:
return self.func(labels, preds, dataset.get_weight()) # type: ignore[call-arg]
return self.func(labels, preds, dataset.get_field("weight")) # type: ignore[call-arg]
elif argc == 4:
return self.func(labels, preds, dataset.get_weight(), dataset.get_group()) # type: ignore[call-arg]
group = dataset.get_field("group")
if group is not None:
return self.func(labels, preds, dataset.get_field("weight"), np.diff(group)) # type: ignore[call-arg]
else:
return self.func(labels, preds, dataset.get_field("weight"), group) # type: ignore[call-arg]
else:
raise TypeError(f"Self-defined eval function should have 2, 3 or 4 arguments, got {argc}")

82 changes: 82 additions & 0 deletions tests/python_package_test/test_basic.py
@@ -499,6 +499,88 @@ def check_asserts(data):
check_asserts(lgb_data)


def test_dataset_construction_overwrites_user_provided_metadata_fields():

X = np.array([[1.0, 2.0], [3.0, 4.0]])

dtrain = lgb.Dataset(
X,
params={
"min_data_in_bin": 1,
"min_data_in_leaf": 1,
"verbosity": -1
},
group=[1, 1],
init_score=[0.312, 0.708],
label=[1, 2],
position=np.array([0.0, 1.0], dtype=np.float32),
weight=[0.5, 1.5],
)

# unconstructed, get_* methods should return whatever was provided
assert dtrain.group == [1, 1]
assert dtrain.get_group() == [1, 1]
assert dtrain.init_score == [0.312, 0.708]
assert dtrain.get_init_score() == [0.312, 0.708]
assert dtrain.label == [1, 2]
assert dtrain.get_label() == [1, 2]
np.testing.assert_array_equal(
dtrain.position,
np.array([0.0, 1.0], dtype=np.float32),
strict=True
)
np.testing.assert_array_equal(
dtrain.get_position(),
np.array([0.0, 1.0], dtype=np.float32),
strict=True
)
assert dtrain.weight == [0.5, 1.5]
assert dtrain.get_weight() == [0.5, 1.5]

# before construction, get_field() should raise an exception
for field_name in ["group", "init_score", "label", "position", "weight"]:
with pytest.raises(Exception, match=f"Cannot get {field_name} before construct Dataset"):
dtrain.get_field(field_name)

# constructed, get_* methods should return numpy arrays, even when the provided
# input was a list of floats or ints
dtrain.construct()
expected_group = [1, 1]
assert dtrain.group == expected_group
assert dtrain.get_group() == expected_group
# get_field("group") returns a numpy array with boundaries, instead of size
np.testing.assert_array_equal(
dtrain.get_field("group"),
np.array([0, 1, 2], dtype=np.int32),
strict=True
)

expected_init_score = np.array([0.312, 0.708])
np.testing.assert_array_equal(dtrain.init_score, expected_init_score, strict=True)
np.testing.assert_array_equal(dtrain.get_init_score(), expected_init_score, strict=True)
np.testing.assert_array_equal(dtrain.get_field("init_score"), expected_init_score, strict=True)

expected_label = np.array([1, 2], dtype=np.float32)
np.testing.assert_array_equal(dtrain.label, expected_label, strict=True)
np.testing.assert_array_equal(dtrain.get_label(), expected_label, strict=True)
np.testing.assert_array_equal(dtrain.get_field("label"), expected_label, strict=True)

expected_position = np.array([0.0, 1.0], dtype=np.float32)
np.testing.assert_array_equal(dtrain.position, expected_position, strict=True)
np.testing.assert_array_equal(dtrain.get_position(), expected_position, strict=True)
# NOTE: "position" is converted to int32 on thhe C++ side
np.testing.assert_array_equal(
dtrain.get_field("position"),
np.array([0.0, 1.0], dtype=np.int32),
strict=True
)

expected_weight = np.array([0.5, 1.5], dtype=np.float32)
np.testing.assert_array_equal(dtrain.weight, expected_weight, strict=True)
np.testing.assert_array_equal(dtrain.get_weight(), expected_weight, strict=True)
np.testing.assert_array_equal(dtrain.get_field("weight"), expected_weight, strict=True)


def test_choose_param_value():

original_params = {