[python-package] fix access to Dataset metadata in scikit-learn custom metrics and objectives #6108

Merged · 20 commits · Nov 7, 2023

49 changes: 32 additions & 17 deletions python-package/lightgbm/basic.py
@@ -368,31 +368,31 @@ def _data_to_2d_numpy(
"It should be list of lists, numpy 2-D array or pandas DataFrame")


def _cfloat32_array_to_numpy(cptr: "ctypes._Pointer", length: int) -> np.ndarray:
def _cfloat32_array_to_numpy(*, cptr: "ctypes._Pointer", length: int) -> np.ndarray:
jameslamb (Collaborator, Author), Sep 22, 2023:

This ensures that this function is only called with keyword arguments.

That syntax has been available since Python 3.0: https://peps.python.org/pep-3102/

Doing that eliminates the possibility of bugs caused by accidentally passing arguments in the wrong order, and (in my opinion) makes the code a bit easier to read.
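
As a minimal illustrative sketch (not part of the diff), the bare * rejects positional calls:

def _scale(*, values, factor):
    """Hypothetical helper: everything after '*' is keyword-only."""
    return [v * factor for v in values]

_scale(values=[1.0, 2.0], factor=3.0)   # OK
# _scale([1.0, 2.0], 3.0)               # TypeError: takes 0 positional arguments but 2 were given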

"""Convert a ctypes float pointer array to a numpy array."""
if isinstance(cptr, ctypes.POINTER(ctypes.c_float)):
return np.ctypeslib.as_array(cptr, shape=(length,)).copy()
else:
raise RuntimeError('Expected float pointer')


def _cfloat64_array_to_numpy(cptr: "ctypes._Pointer", length: int) -> np.ndarray:
def _cfloat64_array_to_numpy(*, cptr: "ctypes._Pointer", length: int) -> np.ndarray:
"""Convert a ctypes double pointer array to a numpy array."""
if isinstance(cptr, ctypes.POINTER(ctypes.c_double)):
return np.ctypeslib.as_array(cptr, shape=(length,)).copy()
else:
raise RuntimeError('Expected double pointer')


def _cint32_array_to_numpy(cptr: "ctypes._Pointer", length: int) -> np.ndarray:
def _cint32_array_to_numpy(*, cptr: "ctypes._Pointer", length: int) -> np.ndarray:
"""Convert a ctypes int pointer array to a numpy array."""
if isinstance(cptr, ctypes.POINTER(ctypes.c_int32)):
return np.ctypeslib.as_array(cptr, shape=(length,)).copy()
else:
raise RuntimeError('Expected int32 pointer')


def _cint64_array_to_numpy(cptr: "ctypes._Pointer", length: int) -> np.ndarray:
def _cint64_array_to_numpy(*, cptr: "ctypes._Pointer", length: int) -> np.ndarray:
"""Convert a ctypes int pointer array to a numpy array."""
if isinstance(cptr, ctypes.POINTER(ctypes.c_int64)):
return np.ctypeslib.as_array(cptr, shape=(length,)).copy()
@@ -1229,18 +1229,18 @@ def __create_sparse_native(
data_indices_len = out_shape[0]
indptr_len = out_shape[1]
if indptr_type == _C_API_DTYPE_INT32:
out_indptr = _cint32_array_to_numpy(out_ptr_indptr, indptr_len)
out_indptr = _cint32_array_to_numpy(cptr=out_ptr_indptr, length=indptr_len)
elif indptr_type == _C_API_DTYPE_INT64:
out_indptr = _cint64_array_to_numpy(out_ptr_indptr, indptr_len)
out_indptr = _cint64_array_to_numpy(cptr=out_ptr_indptr, length=indptr_len)
else:
raise TypeError("Expected int32 or int64 type for indptr")
if data_type == _C_API_DTYPE_FLOAT32:
out_data = _cfloat32_array_to_numpy(out_ptr_data, data_indices_len)
out_data = _cfloat32_array_to_numpy(cptr=out_ptr_data, length=data_indices_len)
elif data_type == _C_API_DTYPE_FLOAT64:
out_data = _cfloat64_array_to_numpy(out_ptr_data, data_indices_len)
out_data = _cfloat64_array_to_numpy(cptr=out_ptr_data, length=data_indices_len)
else:
raise TypeError("Expected float32 or float64 type for data")
out_indices = _cint32_array_to_numpy(out_ptr_indices, data_indices_len)
out_indices = _cint32_array_to_numpy(cptr=out_ptr_indices, length=data_indices_len)
# break up indptr based on number of rows (note more than one matrix in multiclass case)
per_class_indptr_shape = cs.indptr.shape[0]
# for CSC there is extra column added
@@ -2504,6 +2504,12 @@ def set_field(
def get_field(self, field_name: str) -> Optional[np.ndarray]:
"""Get property from the Dataset.

Can only be run on a constructed Dataset.

Unlike ``get_group()``, ``get_init_score()``, ``get_label()``, ``get_position()``, and ``get_weight()``,
this method ignores any raw data passed into ``lgb.Dataset()`` on the Python side, and will only read
data from the constructed C++ ``Dataset`` object.

Parameters
----------
field_name : str
@@ -2530,11 +2536,20 @@ def get_field(self, field_name: str) -> Optional[np.ndarray]:
if tmp_out_len.value == 0:
return None
if out_type.value == _C_API_DTYPE_INT32:
arr = _cint32_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_int32)), tmp_out_len.value)
arr = _cint32_array_to_numpy(
cptr=ctypes.cast(ret, ctypes.POINTER(ctypes.c_int32)),
length=tmp_out_len.value
)
elif out_type.value == _C_API_DTYPE_FLOAT32:
arr = _cfloat32_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_float)), tmp_out_len.value)
arr = _cfloat32_array_to_numpy(
cptr=ctypes.cast(ret, ctypes.POINTER(ctypes.c_float)),
length=tmp_out_len.value
)
elif out_type.value == _C_API_DTYPE_FLOAT64:
arr = _cfloat64_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_double)), tmp_out_len.value)
arr = _cfloat64_array_to_numpy(
cptr=ctypes.cast(ret, ctypes.POINTER(ctypes.c_double)),
length=tmp_out_len.value
)
else:
raise TypeError("Unknown type")
if field_name == 'init_score':
@@ -2834,7 +2849,7 @@ def get_feature_name(self) -> List[str]:
ptr_string_buffers))
return [string_buffers[i].value.decode('utf-8') for i in range(num_feature)]

def get_label(self) -> Optional[np.ndarray]:
def get_label(self) -> Optional[_LGBM_LabelType]:
jameslamb (Collaborator, Author):

These issues in the Dataset.get_{metadata}() methods were the direct cause of the mypy errors mentioned in the description.

Here's an example.

import lightgbm as lgb
import numpy as np

X = np.array([[1.0, 2.0], [3.0, 4.0]])
dtrain = lgb.Dataset(
    data=X,
    label=[1, 2],
    params={
        "min_data_in_bin": 1,
        "min_data_in_leaf": 1,
    },
)

# 'label' was passed in as a list, get_label() returns that list
type(dtrain.get_label())
# <class 'list'>

# after construction, this is pulled from the C++ side, and is a numpy array
dtrain.construct()
type(dtrain.get_label())
# <class 'numpy.ndarray'>

"""Get the label of the Dataset.

Returns
@@ -2846,7 +2861,7 @@ def get_label(self) -> Optional[np.ndarray]:
self.label = self.get_field('label')
return self.label

def get_weight(self) -> Optional[np.ndarray]:
def get_weight(self) -> Optional[_LGBM_WeightType]:
"""Get the weight of the Dataset.

Returns
@@ -2858,7 +2873,7 @@ def get_weight(self) -> Optional[np.ndarray]:
self.weight = self.get_field('weight')
return self.weight

def get_init_score(self) -> Optional[np.ndarray]:
def get_init_score(self) -> Optional[_LGBM_InitScoreType]:
"""Get the initial score of the Dataset.

Returns
@@ -2902,7 +2917,7 @@ def get_data(self) -> Optional[_LGBM_TrainDataType]:
"set free_raw_data=False when construct Dataset to avoid this.")
return self.data

def get_group(self) -> Optional[np.ndarray]:
def get_group(self) -> Optional[_LGBM_GroupType]:
"""Get the group of the Dataset.

Returns
@@ -2921,7 +2936,7 @@ def get_group(self) -> Optional[np.ndarray]:
self.group = np.diff(self.group)
return self.group

def get_position(self) -> Optional[np.ndarray]:
def get_position(self) -> Optional[_LGBM_PositionType]:
"""Get the position of the Dataset.

Returns
20 changes: 14 additions & 6 deletions python-package/lightgbm/sklearn.py
@@ -151,14 +151,18 @@ def __call__(self, preds: np.ndarray, dataset: Dataset) -> Tuple[np.ndarray, np.ndarray]:
The value of the second order derivative (Hessian) of the loss
with respect to the elements of preds for each sample point.
"""
labels = dataset.get_label()
labels = dataset.get_field("label")
jameslamb (Collaborator, Author):

After correcting the type hints in Dataset methods, mypy rightly started complaining about these parts of the sklearn interface, where custom objective functions and metric functions are called.

With many instances like this:

sklearn.py:157: error: Argument 1 has incompatible type "Union[List[float], List[int], ndarray[Any, Any], Any, None]"; expected "Optional[ndarray[Any, Any]]"  [arg-type]

That's because we say in documentation and type hints that sklearn custom objective functions and eval metric functions should only expect to be passed numpy arrays or None.

_LGBM_ScikitCustomObjectiveFunction = Union[
# f(labels, preds)
Callable[
[Optional[np.ndarray], np.ndarray],
Tuple[np.ndarray, np.ndarray]
],
# f(labels, preds, weights)
Callable[
[Optional[np.ndarray], np.ndarray, Optional[np.ndarray]],
Tuple[np.ndarray, np.ndarray]
],
# f(labels, preds, weights, group)
Callable[
[Optional[np.ndarray], np.ndarray, Optional[np.ndarray], Optional[np.ndarray]],
Tuple[np.ndarray, np.ndarray]
],
]

Expects a callable with following signatures:
``func(y_true, y_pred)``,
``func(y_true, y_pred, weight)``
or ``func(y_true, y_pred, weight, group)``
and returns (grad, hess):
y_true : numpy 1-D array of shape = [n_samples]
The target values.
y_pred : numpy 1-D array of shape = [n_samples] or numpy 2-D array of shape = [n_samples, n_classes] (for multi-class task)
The predicted values.
Predicted values are returned before any transformation,
e.g. they are raw margin instead of probability of positive class for binary task.
weight : numpy 1-D array of shape = [n_samples]
The weight of samples. Weights should be non-negative.
group : numpy 1-D array

This PR proposes fixing that by switching from get_label() to get_field("label") (and similarly for all the other metadata). Custom metrics and eval functions in the sklearn interface should only ever be called on already-constructed Dataset objects, so that should be safe. It should also reduce the risk of some mistake accidentally leading to something other than numpy arrays being passed into sklearn custom objective and metric functions.

jmoralez (Collaborator):

Have you looked into the performance impact of this? This method is called on each iteration and we're currently just returning an attribute from the dataset but this would require making a call to C++ to get the field each time. I think it just returns the pointer so it may not be that expensive but it's something to consider given that we'd be making several calls to get all the fields that this method needs.
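
As a rough, hypothetical sketch of the difference (array size and repeat count are arbitrary):

import timeit

import lightgbm as lgb
import numpy as np

rng = np.random.default_rng(42)
dtrain = lgb.Dataset(rng.random((100_000, 5)), label=rng.random(100_000))
dtrain.construct()

# get_label() returns the numpy array cached on the Python object;
# get_field("label") crosses into C++ and builds a fresh array every call
t_cached = timeit.timeit(dtrain.get_label, number=1_000)
t_field = timeit.timeit(lambda: dtrain.get_field("label"), number=1_000)
print(f"get_label():        {t_cached:.4f}s")
print(f"get_field('label'): {t_field:.4f}s")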

jameslamb (Collaborator, Author):

No, I didn't consider that this would be a performance issue. You're right, it could be!

> I think it just returns the pointer

Nah, your instincts that this might be more expensive were good! This call to LGBM_DatasetGetField() ...

_safe_call(_LIB.LGBM_DatasetGetField(
self._handle,
_c_str(field_name),
ctypes.byref(tmp_out_len),
ctypes.byref(ret),
ctypes.byref(out_type)))

... is just populating a pointer ...

if (dataset->GetFloatField(field_name, out_len, reinterpret_cast<const float**>(out_ptr))) {

*out_ptr = metadata_.label();

...but then that gets materialized into a new numpy array (i.e. a full copy is made in memory):

if out_type.value == _C_API_DTYPE_INT32:
arr = _cint32_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_int32)), tmp_out_len.value)
elif out_type.value == _C_API_DTYPE_FLOAT32:
arr = _cfloat32_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_float)), tmp_out_len.value)
elif out_type.value == _C_API_DTYPE_FLOAT64:
arr = _cfloat64_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_double)), tmp_out_len.value)

return np.ctypeslib.as_array(cptr, shape=(length,)).copy()
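
A standalone toy sketch (plain ctypes buffer, not LightGBM's) of that view-vs-copy distinction:

import ctypes

import numpy as np

buf = (ctypes.c_double * 3)(1.0, 2.0, 3.0)
ptr = ctypes.cast(buf, ctypes.POINTER(ctypes.c_double))

view = np.ctypeslib.as_array(ptr, shape=(3,))  # zero-copy view over the C buffer
copy = view.copy()                             # new allocation, independent memory

buf[0] = 99.0
print(view[0])  # 99.0 -- the view sees the mutation
print(copy[0])  # 1.0  -- the copy does not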

So I think you're right, calling get_field() unconditionally might not be a good way to do this. That's just wasted effort relative to .get_label(), .get_weight(), etc., since on a constructed Dataset, .label, .weight, etc. should already contain numpy arrays regardless of what format the raw data was passed in.

Thanks to these calls in Dataset._lazy_init():

if label is not None:
self.set_label(label)
if self.get_label() is None:
raise ValueError("Label should not be None")
if weight is not None:
self.set_weight(weight)
if group is not None:
self.set_group(group)
if position is not None:
self.set_position(position)

So given all that... let me think about whether there's a better way to resolve these errors from mypy. I don't think they're just things we should ignore... I do feel we should add some stronger guarantees that custom objective and metric functions are only ever passed an already-constructed Dataset.

Thanks for bringing it up!

jameslamb (Collaborator, Author):

Alright @jmoralez I think I found a better way to do this. I just pushed 017e5e5, with the following changes:

  • switches back to using get_label() and get_weight() instead of get_field()
    • because those will return the data cached on the Dataset object without reconstructing a new numpy array on every iteration
  • keeps get_field("group") for group
    • that'll still introduce a performance penalty (reconstructing an array on every iteration if the objective function / eval metric takes 4 arguments, i.e. is for learning-to-rank)
    • but I think we need to do that... right now the docs say such functions should expect group to be a numpy array, but lightgbm is currently passing them a list
    • group : numpy 1-D array
  • fixes the mypy errors by putting get_label() and get_weight() calls inside functions that use assert isinstance(x, np.ndarray) for type narrowing (see the sketch after this list)
  • adds docs to Dataset.get_{group/init_score/weight/position/label} explaining that the return type will be different for a constructed Dataset
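
Roughly, that narrowing pattern looks like the following (the helper name here is illustrative, not necessarily what the commit uses):

import numpy as np
from lightgbm import Dataset

def _label_from_constructed_dataset(dataset: Dataset) -> np.ndarray:
    # on a constructed Dataset, get_label() always yields a numpy array,
    # so this assert narrows the Union return type for mypy
    label = dataset.get_label()
    assert isinstance(label, np.ndarray)
    return label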

@jmoralez whenever you have time, could you please take another look? Thanks again for bringing this up!

jmoralez (Collaborator):

Can't we do for group what we do for other fields like init_score and weight, where after we set the field we set the Dataset attribute to the result of get_field?

self.set_field('init_score', init_score)
self.init_score = self.get_field('init_score') # original values can be modified at cpp side

self.set_field('weight', weight)
self.weight = self.get_field('weight') # original values can be modified at cpp side

jameslamb (Collaborator, Author):

We could but that would be a user-facing breaking change. Look at the test cases in this PR... even after construction, Dataset.get_group() returns whatever was passed in (most commonly a list).

Do you think it's worth the breaking change to get that consistency?

jmoralez (Collaborator):

I think it is, it's weird having all of the arguments of a custom objective be arrays except for group. Although the main difference I think is not really the data structure but the format that they're in (group boundaries vs lengths). Given that the docstring says array and it's more consistent with the other fields I'd prefer we override it with get_field. On the other hand I haven't used ranking a lot so I'm not sure which format is more convenient for the objective function.
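
For reference, a small sketch of the two formats (sizes here are made up):

import numpy as np

# users pass per-group sizes; the C++ side stores cumulative boundaries
sizes = np.array([10, 15, 15])
boundaries = np.concatenate([[0], np.cumsum(sizes)])       # array([ 0, 10, 25, 40])
np.testing.assert_array_equal(np.diff(boundaries), sizes)  # np.diff recovers the sizes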

jameslamb (Collaborator, Author):

Ok I think your reasoning makes sense and that we should change it to an array.

I'm especially convinced since tonight I found that .group gets set to a numpy array the first time you call get_group() on a Dataset where the .group attribute is None, which can happen when loading from a binary file:

import lightgbm as lgb
import numpy as np

X = np.array([[1.0, 2.0], [3.0, 4.0]])

dtrain = lgb.Dataset(
    X,
    params={
        "min_data_in_bin": 1,
        "min_data_in_leaf": 1,
        "verbosity": -1
    },
    group=[1, 1],
    label=[1, 2],
)
dtrain.construct()

# get_group() returns a list of sizes
assert dtrain.get_group() == [1, 1]

# get_field() returns a numpy array of boundaries
np.testing.assert_array_equal(
    dtrain.get_field("group"),
    np.array([0, 1, 2])
)


# round-trip to and from binary file
dtrain.save_binary("dtrain.bin")
dtrain2 = lgb.Dataset(
    data="dtrain.bin",
    params={
        "min_data_in_bin": 1,
        "min_data_in_leaf": 1,
        "verbosity": -1
    }
)

# before construction, group is empty
assert dtrain2.group is None

# after construction, get_group() returns a numpy array of sizes
dtrain2.construct()
np.testing.assert_array_equal(
    dtrain2.get_group(),
    np.array([1, 1])
)

# ... and get_field() returns a numpy array of boundaries
np.testing.assert_array_equal(
    dtrain2.get_field("group"),
    np.array([0, 1, 2])
)

That doesn't matter specifically for the scikit-learn interface (which I don't believe could ever encounter a binary dataset file), but it does give me more confidence that other parts of the code base aren't implicitly relying on Dataset.group being a list.


I just did the following:

  • pushed ada18f8 with these changes
  • changed the label on this PR to breaking
  • changed the title of the PR
  • added a section in the description explaining why this is breaking

Thanks for talking through it with me, I know this is way down in the depths of the library and that reviewing it takes a lot of effort.

jameslamb (Collaborator, Author):

I won't merge this until you've had another chance to review, @jmoralez. Take your time!

argc = len(signature(self.func).parameters)
if argc == 2:
grad, hess = self.func(labels, preds) # type: ignore[call-arg]
elif argc == 3:
grad, hess = self.func(labels, preds, dataset.get_weight()) # type: ignore[call-arg]
grad, hess = self.func(labels, preds, dataset.get_field("weight")) # type: ignore[call-arg]
elif argc == 4:
grad, hess = self.func(labels, preds, dataset.get_weight(), dataset.get_group()) # type: ignore [call-arg]
group = dataset.get_field("group")
if group is not None:
return self.func(labels, preds, dataset.get_field("weight"), np.diff(group)) # type: ignore[call-arg]
else:
return self.func(labels, preds, dataset.get_field("weight"), group) # type: ignore[call-arg]
else:
raise TypeError(f"Self-defined objective function should have 2, 3 or 4 arguments, got {argc}")
return grad, hess
@@ -229,14 +233,18 @@ def __call__(
is_higher_better : bool
Is eval result higher better, e.g. AUC is ``is_higher_better``.
"""
labels = dataset.get_label()
labels = dataset.get_field("label")
argc = len(signature(self.func).parameters)
if argc == 2:
return self.func(labels, preds) # type: ignore[call-arg]
elif argc == 3:
return self.func(labels, preds, dataset.get_weight()) # type: ignore[call-arg]
return self.func(labels, preds, dataset.get_field("weight")) # type: ignore[call-arg]
elif argc == 4:
return self.func(labels, preds, dataset.get_weight(), dataset.get_group()) # type: ignore[call-arg]
group = dataset.get_field("group")
if group is not None:
return self.func(labels, preds, dataset.get_field("weight"), np.diff(group)) # type: ignore[call-arg]
else:
return self.func(labels, preds, dataset.get_field("weight"), group) # type: ignore[call-arg]
else:
raise TypeError(f"Self-defined eval function should have 2, 3 or 4 arguments, got {argc}")

82 changes: 82 additions & 0 deletions tests/python_package_test/test_basic.py
@@ -499,6 +499,88 @@ def check_asserts(data):
check_asserts(lgb_data)


def test_dataset_construction_overwrites_user_provided_metadata_fields():

X = np.array([[1.0, 2.0], [3.0, 4.0]])

dtrain = lgb.Dataset(
X,
params={
"min_data_in_bin": 1,
"min_data_in_leaf": 1,
"verbosity": -1
},
group=[1, 1],
init_score=[0.312, 0.708],
label=[1, 2],
position=np.array([0.0, 1.0], dtype=np.float32),
weight=[0.5, 1.5],
)

# unconstructed, get_* methods should return whatever was provided
assert dtrain.group == [1, 1]
assert dtrain.get_group() == [1, 1]
assert dtrain.init_score == [0.312, 0.708]
assert dtrain.get_init_score() == [0.312, 0.708]
assert dtrain.label == [1, 2]
assert dtrain.get_label() == [1, 2]
np.testing.assert_array_equal(
dtrain.position,
np.array([0.0, 1.0], dtype=np.float32),
strict=True
)
np.testing.assert_array_equal(
dtrain.get_position(),
np.array([0.0, 1.0], dtype=np.float32),
strict=True
)
assert dtrain.weight == [0.5, 1.5]
assert dtrain.get_weight() == [0.5, 1.5]

# before construction, get_field() should raise an exception
for field_name in ["group", "init_score", "label", "position", "weight"]:
with pytest.raises(Exception, match=f"Cannot get {field_name} before construct Dataset"):
dtrain.get_field(field_name)

# constructed, get_* methods should return numpy arrays, even when the provided
# input was a list of floats or ints
dtrain.construct()
expected_group = [1, 1]
assert dtrain.group == expected_group
assert dtrain.get_group() == expected_group
# get_field("group") returns a numpy array with boundaries, instead of size
np.testing.assert_array_equal(
dtrain.get_field("group"),
np.array([0, 1, 2], dtype=np.int32),
strict=True
)

expected_init_score = np.array([0.312, 0.708])
np.testing.assert_array_equal(dtrain.init_score, expected_init_score, strict=True)
np.testing.assert_array_equal(dtrain.get_init_score(), expected_init_score, strict=True)
np.testing.assert_array_equal(dtrain.get_field("init_score"), expected_init_score, strict=True)

expected_label = np.array([1, 2], dtype=np.float32)
np.testing.assert_array_equal(dtrain.label, expected_label, strict=True)
np.testing.assert_array_equal(dtrain.get_label(), expected_label, strict=True)
np.testing.assert_array_equal(dtrain.get_field("label"), expected_label, strict=True)

expected_position = np.array([0.0, 1.0], dtype=np.float32)
np.testing.assert_array_equal(dtrain.position, expected_position, strict=True)
np.testing.assert_array_equal(dtrain.get_position(), expected_position, strict=True)
# NOTE: "position" is converted to int32 on thhe C++ side
np.testing.assert_array_equal(
dtrain.get_field("position"),
np.array([0.0, 1.0], dtype=np.int32),
strict=True
)

expected_weight = np.array([0.5, 1.5], dtype=np.float32)
np.testing.assert_array_equal(dtrain.weight, expected_weight, strict=True)
np.testing.assert_array_equal(dtrain.get_weight(), expected_weight, strict=True)
np.testing.assert_array_equal(dtrain.get_field("weight"), expected_weight, strict=True)


def test_choose_param_value():

original_params = {