From 462cf16a71103a4f0e0a7091388defd259a79ee2 Mon Sep 17 00:00:00 2001 From: StrikerRUS Date: Sun, 18 Nov 2018 23:25:28 +0300 Subject: [PATCH 1/6] added get_data method --- docs/FAQ.rst | 8 ++++---- python-package/lightgbm/basic.py | 23 +++++++++++++++++++++-- 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/docs/FAQ.rst b/docs/FAQ.rst index b9b949dff3f..fda92af8e9d 100644 --- a/docs/FAQ.rst +++ b/docs/FAQ.rst @@ -211,12 +211,12 @@ Python-package If you set ``free_raw_data=True`` (default), the raw data (with Python data struct) will be freed. So, if you want to: - - get label (or weight/init\_score/group) before constructing a dataset, it's same as get ``self.label`` + - get label (or weight/init\_score/group) before constructing a dataset, it's same as get ``self.label``; - - set label (or weight/init\_score/group) before constructing a dataset, it's same as ``self.label=some_label_array`` + - set label (or weight/init\_score/group) before constructing a dataset, it's same as ``self.label=some_label_array``; - get num\_data (or num\_feature) before constructing a dataset, you can get data with ``self.data``. - Then, if your data is ``numpy.ndarray``, use some code like ``self.data.shape`` + Then, if your data is ``numpy.ndarray``, use some code like ``self.data.shape``. But do not do this after subsetting the Dataset, you'll get unsliced raw data; - set predictor (or reference/categorical feature) after constructing a dataset, - you should set ``free_raw_data=False`` or init a Dataset object with the same raw data + you should set ``free_raw_data=False`` or init a Dataset object with the same raw data. 
diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index b347f73ce41..a98aedfcaec 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -687,6 +687,7 @@ def __init__(self, data, label=None, reference=None, self.params = copy.deepcopy(params) self.free_raw_data = free_raw_data self.used_indices = None + self.need_slice = True self._predictor = None self.pandas_categorical = None self.params_back_up = None @@ -974,6 +975,7 @@ def construct(self): ctypes.c_int(used_indices.shape[0]), c_str(params_str), ctypes.byref(self.handle))) + self.data = self.get_data() if self.group is not None: self.set_group(self.group) if self.get_label() is None: @@ -1040,8 +1042,9 @@ def subset(self, used_indices, params=None): """ if params is None: params = self.params - ret = Dataset(None, reference=self, feature_name=self.feature_name, - categorical_feature=self.categorical_feature, params=params) + ret = Dataset(self.data, reference=self, feature_name=self.feature_name, + categorical_feature=self.categorical_feature, params=params, + free_raw_data=self.free_raw_data) ret._predictor = self._predictor ret.pandas_categorical = self.pandas_categorical ret.used_indices = used_indices @@ -1375,6 +1378,22 @@ def get_init_score(self): self.init_score = self.get_field('init_score') return self.init_score + def get_data(self): + """Get the raw data of the Dataset. + + Returns + ------- + data : string, numpy array, pandas DataFrame, scipy.sparse, list of numpy arrays or None + Raw data used in the Dataset construction. + """ + if self.data is not None and self.used_indices is not None and self.need_slice: + if isinstance(self.data, np.ndarray) or scipy.sparse.issparse(self.data): + self.data = self.data[self.used_indices, :] + elif isinstance(self.data, DataFrame): + self.data = self.data.iloc[self.used_indices] + self.need_slice = False + return self.data + def get_group(self): """Get the group of the Dataset. 
From daf85e50f8b0d008919d04f6c9b3d1c9f589b43c Mon Sep 17 00:00:00 2001 From: StrikerRUS Date: Sat, 24 Nov 2018 01:24:21 +0300 Subject: [PATCH 2/6] hotfix --- python-package/lightgbm/basic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index a98aedfcaec..a354b6e79ac 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -1390,7 +1390,7 @@ def get_data(self): if isinstance(self.data, np.ndarray) or scipy.sparse.issparse(self.data): self.data = self.data[self.used_indices, :] elif isinstance(self.data, DataFrame): - self.data = self.data.iloc[self.used_indices] + self.data = self.data.iloc[self.used_indices].copy() self.need_slice = False return self.data From c36b89f84232d052ff68342706ea80c6d63a0678 Mon Sep 17 00:00:00 2001 From: StrikerRUS Date: Mon, 10 Dec 2018 15:28:09 +0300 Subject: [PATCH 3/6] added warning for other data types --- python-package/lightgbm/basic.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index a354b6e79ac..d4beaa8aee0 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -1391,6 +1391,9 @@ def get_data(self): self.data = self.data[self.used_indices, :] elif isinstance(self.data, DataFrame): self.data = self.data.iloc[self.used_indices].copy() + else: + warnings.warn("Cannot subset {} type of raw data.\n" + "Returning original raw data".format(type(self.data).__name__)) self.need_slice = False return self.data From 1ff6c6d6bca05de2c13490264ba3b6bf7f993d4a Mon Sep 17 00:00:00 2001 From: StrikerRUS Date: Mon, 10 Dec 2018 16:18:05 +0300 Subject: [PATCH 4/6] reworked according to review comments --- docs/FAQ.rst | 2 +- python-package/lightgbm/basic.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/docs/FAQ.rst b/docs/FAQ.rst index d222a8b0bc9..d653b649c8a 100644 --- a/docs/FAQ.rst +++ 
b/docs/FAQ.rst @@ -216,7 +216,7 @@ Python-package - set label (or weight/init\_score/group) before constructing a dataset, it's same as ``self.label=some_label_array``; - get num\_data (or num\_feature) before constructing a dataset, you can get data with ``self.data``. - Then, if your data is ``numpy.ndarray``, use some code like ``self.data.shape``. But do not do this after subsetting the Dataset, you'll get unsliced raw data; + Then, if your data is ``numpy.ndarray``, use some code like ``self.data.shape``. But do not do this after subsetting the Dataset, because you'll always get ``None``; - set predictor (or reference/categorical feature) after constructing a dataset, you should set ``free_raw_data=False`` or init a Dataset object with the same raw data. diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index d4beaa8aee0..83d0c581fef 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -975,7 +975,8 @@ def construct(self): ctypes.c_int(used_indices.shape[0]), c_str(params_str), ctypes.byref(self.handle))) - self.data = self.get_data() + self.data = self.reference.data + self.get_data() if self.group is not None: self.set_group(self.group) if self.get_label() is None: @@ -1042,7 +1043,7 @@ def subset(self, used_indices, params=None): """ if params is None: params = self.params - ret = Dataset(self.data, reference=self, feature_name=self.feature_name, + ret = Dataset(None, reference=self, feature_name=self.feature_name, categorical_feature=self.categorical_feature, params=params, free_raw_data=self.free_raw_data) ret._predictor = self._predictor @@ -1386,6 +1387,8 @@ def get_data(self): data : string, numpy array, pandas DataFrame, scipy.sparse, list of numpy arrays or None Raw data used in the Dataset construction. 
""" + if self.handle is None: + raise Exception("Cannot get data before construct Dataset") if self.data is not None and self.used_indices is not None and self.need_slice: if isinstance(self.data, np.ndarray) or scipy.sparse.issparse(self.data): self.data = self.data[self.used_indices, :] From c15c8e41376d0998e02a90fc1ee4605875065988 Mon Sep 17 00:00:00 2001 From: StrikerRUS Date: Fri, 14 Dec 2018 22:43:57 +0300 Subject: [PATCH 5/6] minor addition to FAQ --- docs/FAQ.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/FAQ.rst b/docs/FAQ.rst index d653b649c8a..42cd91868af 100644 --- a/docs/FAQ.rst +++ b/docs/FAQ.rst @@ -211,7 +211,7 @@ Python-package If you set ``free_raw_data=True`` (default), the raw data (with Python data struct) will be freed. So, if you want to: - - get label (or weight/init\_score/group) before constructing a dataset, it's same as get ``self.label``; + - get label (or weight/init\_score/group/data) before constructing a dataset, it's same as get ``self.label``; - set label (or weight/init\_score/group) before constructing a dataset, it's same as ``self.label=some_label_array``; From 4bed1ae4f83ee59410c25726c31e02f3e3c14ecd Mon Sep 17 00:00:00 2001 From: StrikerRUS Date: Fri, 14 Dec 2018 23:28:19 +0300 Subject: [PATCH 6/6] added test --- tests/python_package_test/test_engine.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 347a8d1b99f..5958957ec8d 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -808,3 +808,23 @@ def test_constant_features_multiclassova(self): } self.test_constant_features([0.0, 1.0, 2.0, 0.0], [0.5, 0.25, 0.25], params) self.test_constant_features([0.0, 1.0, 2.0, 1.0], [0.25, 0.5, 0.25], params) + + def test_fpreproc(self): + def preprocess_data(dtrain, dtest, params): + train_data = dtrain.construct().get_data() + test_data = 
dtest.construct().get_data() + train_data[:, 0] += 1 + test_data[:, 0] += 1 + dtrain.label[-5:] = 3 + dtest.label[-5:] = 3 + dtrain = lgb.Dataset(train_data, dtrain.label) + dtest = lgb.Dataset(test_data, dtest.label, reference=dtrain) + params['num_class'] = 4 + return dtrain, dtest, params + + X, y = load_iris(True) + dataset = lgb.Dataset(X, y, free_raw_data=False) + params = {'objective': 'multiclass', 'num_class': 3, 'verbose': -1} + results = lgb.cv(params, dataset, num_boost_round=10, fpreproc=preprocess_data) + self.assertIn('multi_logloss-mean', results) + self.assertEqual(len(results['multi_logloss-mean']), 10)