diff --git a/docs/FAQ.rst b/docs/FAQ.rst
index 8cc1df93aac5..42cd91868af5 100644
--- a/docs/FAQ.rst
+++ b/docs/FAQ.rst
@@ -211,15 +211,15 @@ Python-package
 
 If you set ``free_raw_data=True`` (default), the raw data (with Python data struct) will be freed. So, if you want to:
 
-  - get label (or weight/init\_score/group) before constructing a dataset, it's same as get ``self.label``
+  - get label (or weight/init\_score/group/data) before constructing a dataset, it's same as get ``self.label``;
 
-  - set label (or weight/init\_score/group) before constructing a dataset, it's same as ``self.label=some_label_array``
+  - set label (or weight/init\_score/group) before constructing a dataset, it's same as ``self.label=some_label_array``;
 
   - get num\_data (or num\_feature) before constructing a dataset, you can get data with ``self.data``.
-    Then, if your data is ``numpy.ndarray``, use some code like ``self.data.shape``
+    Then, if your data is ``numpy.ndarray``, use some code like ``self.data.shape``. But do not do this after subsetting the Dataset, because you'll get always ``None``;
 
   - set predictor (or reference/categorical feature) after constructing a dataset,
-    you should set ``free_raw_data=False`` or init a Dataset object with the same raw data
+    you should set ``free_raw_data=False`` or init a Dataset object with the same raw data.
 
 --------------
 
diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py
index b347f73ce41a..83d0c581fef0 100644
--- a/python-package/lightgbm/basic.py
+++ b/python-package/lightgbm/basic.py
@@ -687,6 +687,7 @@ def __init__(self, data, label=None, reference=None,
         self.params = copy.deepcopy(params)
         self.free_raw_data = free_raw_data
         self.used_indices = None
+        self.need_slice = True
         self._predictor = None
         self.pandas_categorical = None
         self.params_back_up = None
@@ -974,6 +975,8 @@ def construct(self):
                         ctypes.c_int(used_indices.shape[0]),
                         c_str(params_str),
                         ctypes.byref(self.handle)))
+                    self.data = self.reference.data
+                    self.get_data()
                     if self.group is not None:
                         self.set_group(self.group)
                     if self.get_label() is None:
@@ -1041,7 +1044,8 @@
         if params is None:
             params = self.params
         ret = Dataset(None, reference=self, feature_name=self.feature_name,
-                      categorical_feature=self.categorical_feature, params=params)
+                      categorical_feature=self.categorical_feature, params=params,
+                      free_raw_data=self.free_raw_data)
         ret._predictor = self._predictor
         ret.pandas_categorical = self.pandas_categorical
         ret.used_indices = used_indices
@@ -1375,6 +1379,27 @@ def get_init_score(self):
             self.init_score = self.get_field('init_score')
         return self.init_score
 
+    def get_data(self):
+        """Get the raw data of the Dataset.
+
+        Returns
+        -------
+        data : string, numpy array, pandas DataFrame, scipy.sparse, list of numpy arrays or None
+            Raw data used in the Dataset construction.
+        """
+        if self.handle is None:
+            raise Exception("Cannot get data before construct Dataset")
+        if self.data is not None and self.used_indices is not None and self.need_slice:
+            if isinstance(self.data, np.ndarray) or scipy.sparse.issparse(self.data):
+                self.data = self.data[self.used_indices, :]
+            elif isinstance(self.data, DataFrame):
+                self.data = self.data.iloc[self.used_indices].copy()
+            else:
+                warnings.warn("Cannot subset {} type of raw data.\n"
+                              "Returning original raw data".format(type(self.data).__name__))
+            self.need_slice = False
+        return self.data
+
     def get_group(self):
         """Get the group of the Dataset.
 
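Taken together, the ``basic.py`` changes make ``subset()`` propagate ``free_raw_data`` to the child Dataset and let ``get_data()`` return the parent's raw data sliced by ``used_indices``; the ``need_slice`` flag added in ``__init__`` ensures the slicing happens only once, on the first ``get_data()`` call after construction. A minimal stand-alone sketch of the resulting behaviour, assuming the patched package is importable as ``lightgbm`` (the toy arrays and variable names below are illustrative, not part of the patch):

import numpy as np
import lightgbm as lgb

X = np.random.rand(100, 5)
y = np.random.randint(0, 2, size=100)

# Keep the raw data alive so that a subset can be sliced by get_data() later.
full = lgb.Dataset(X, y, free_raw_data=False).construct()
sub = full.subset(list(range(50))).construct()

# The subset inherits free_raw_data=False, so get_data() returns the
# parent's raw data restricted to used_indices instead of None.
print(sub.get_data().shape)  # (50, 5)
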
diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py
index 347a8d1b99f4..5958957ec8dd 100644
--- a/tests/python_package_test/test_engine.py
+++ b/tests/python_package_test/test_engine.py
@@ -808,3 +808,23 @@ def test_constant_features_multiclassova(self):
         }
         self.test_constant_features([0.0, 1.0, 2.0, 0.0], [0.5, 0.25, 0.25], params)
         self.test_constant_features([0.0, 1.0, 2.0, 1.0], [0.25, 0.5, 0.25], params)
+
+    def test_fpreproc(self):
+        def preprocess_data(dtrain, dtest, params):
+            train_data = dtrain.construct().get_data()
+            test_data = dtest.construct().get_data()
+            train_data[:, 0] += 1
+            test_data[:, 0] += 1
+            dtrain.label[-5:] = 3
+            dtest.label[-5:] = 3
+            dtrain = lgb.Dataset(train_data, dtrain.label)
+            dtest = lgb.Dataset(test_data, dtest.label, reference=dtrain)
+            params['num_class'] = 4
+            return dtrain, dtest, params
+
+        X, y = load_iris(True)
+        dataset = lgb.Dataset(X, y, free_raw_data=False)
+        params = {'objective': 'multiclass', 'num_class': 3, 'verbose': -1}
+        results = lgb.cv(params, dataset, num_boost_round=10, fpreproc=preprocess_data)
+        self.assertIn('multi_logloss-mean', results)
+        self.assertEqual(len(results['multi_logloss-mean']), 10)
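The new test drives ``get_data()`` through the ``fpreproc`` hook of ``lgb.cv``: each CV fold is a subset of the parent Dataset, so the callback can retrieve and modify the fold's raw data only because the parent was built with ``free_raw_data=False``. A hedged sketch of the same pattern outside the unittest harness (the callback name, toy data, and parameter values are illustrative assumptions):

import numpy as np
import lightgbm as lgb

X = np.random.rand(150, 4)
y = np.random.randint(0, 3, size=150)

def shift_first_feature(dtrain, dtest, params):
    # Works only because the parent Dataset kept its raw data:
    # construct() slices it for the fold and get_data() returns the copy.
    train_data = dtrain.construct().get_data()
    test_data = dtest.construct().get_data()
    train_data[:, 0] += 1
    test_data[:, 0] += 1
    dtrain = lgb.Dataset(train_data, dtrain.label)
    dtest = lgb.Dataset(test_data, dtest.label, reference=dtrain)
    return dtrain, dtest, params

dataset = lgb.Dataset(X, y, free_raw_data=False)
params = {'objective': 'multiclass', 'num_class': 3, 'verbose': -1}
results = lgb.cv(params, dataset, num_boost_round=5, fpreproc=shift_first_feature)
print(len(results['multi_logloss-mean']))  # 5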