[python] added get_data() method to Dataset class (#1870)
* added get_data method

* hotfix

* added warning for other data types

* reworked according to review comments

* minor addition to FAQ

* added test
StrikerRUS authored and guolinke committed Dec 20, 2018
1 parent 92e95e6 commit 2323cb3
Showing 3 changed files with 50 additions and 5 deletions.
8 changes: 4 additions & 4 deletions docs/FAQ.rst
@@ -211,15 +211,15 @@ Python-package
 If you set ``free_raw_data=True`` (default), the raw data (with Python data struct) will be freed.
 So, if you want to:

-- get label (or weight/init\_score/group) before constructing a dataset, it's same as get ``self.label``
+- get label (or weight/init\_score/group/data) before constructing a dataset, it's same as get ``self.label``;

-- set label (or weight/init\_score/group) before constructing a dataset, it's same as ``self.label=some_label_array``
+- set label (or weight/init\_score/group) before constructing a dataset, it's same as ``self.label=some_label_array``;

 - get num\_data (or num\_feature) before constructing a dataset, you can get data with ``self.data``.
-  Then, if your data is ``numpy.ndarray``, use some code like ``self.data.shape``
+  Then, if your data is ``numpy.ndarray``, use some code like ``self.data.shape``. But do not do this after subsetting the Dataset, because you'll get always ``None``;

 - set predictor (or reference/categorical feature) after constructing a dataset,
-  you should set ``free_raw_data=False`` or init a Dataset object with the same raw data
+  you should set ``free_raw_data=False`` or init a Dataset object with the same raw data.

 --------------
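
As a quick illustration of the bullets above, here is a minimal sketch (not part of the commit; the toy arrays and the ``train_data`` name are illustrative):

    import numpy as np
    import lightgbm as lgb

    X = np.random.rand(100, 10)   # illustrative toy data
    y = np.random.randint(2, size=100)

    # keep the raw data so fields can still be changed after construction
    train_data = lgb.Dataset(X, y, free_raw_data=False)

    train_data.label = np.ones(100)                # set label before construction
    num_data, num_feature = train_data.data.shape  # works because the raw data is a numpy.ndarray

    train_data.construct()
    # with the default free_raw_data=True the raw data would now have been freed,
    # and changing the categorical features below would raise an error
    train_data.set_categorical_feature([0])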

27 changes: 26 additions & 1 deletion python-package/lightgbm/basic.py
@@ -687,6 +687,7 @@ def __init__(self, data, label=None, reference=None,
         self.params = copy.deepcopy(params)
         self.free_raw_data = free_raw_data
         self.used_indices = None
+        self.need_slice = True
         self._predictor = None
         self.pandas_categorical = None
         self.params_back_up = None
@@ -974,6 +975,8 @@ def construct(self):
                         ctypes.c_int(used_indices.shape[0]),
                         c_str(params_str),
                         ctypes.byref(self.handle)))
+                    self.data = self.reference.data
+                    self.get_data()
                     if self.group is not None:
                         self.set_group(self.group)
                     if self.get_label() is None:
@@ -1041,7 +1044,8 @@ def subset(self, used_indices, params=None):
         if params is None:
             params = self.params
         ret = Dataset(None, reference=self, feature_name=self.feature_name,
-                      categorical_feature=self.categorical_feature, params=params)
+                      categorical_feature=self.categorical_feature, params=params,
+                      free_raw_data=self.free_raw_data)
         ret._predictor = self._predictor
         ret.pandas_categorical = self.pandas_categorical
         ret.used_indices = used_indices
@@ -1375,6 +1379,27 @@ def get_init_score(self):
             self.init_score = self.get_field('init_score')
         return self.init_score

+    def get_data(self):
+        """Get the raw data of the Dataset.
+
+        Returns
+        -------
+        data : string, numpy array, pandas DataFrame, scipy.sparse, list of numpy arrays or None
+            Raw data used in the Dataset construction.
+        """
+        if self.handle is None:
+            raise Exception("Cannot get data before construct Dataset")
+        if self.data is not None and self.used_indices is not None and self.need_slice:
+            if isinstance(self.data, np.ndarray) or scipy.sparse.issparse(self.data):
+                self.data = self.data[self.used_indices, :]
+            elif isinstance(self.data, DataFrame):
+                self.data = self.data.iloc[self.used_indices].copy()
+            else:
+                warnings.warn("Cannot subset {} type of raw data.\n"
+                              "Returning original raw data".format(type(self.data).__name__))
+            self.need_slice = False
+        return self.data
+
     def get_group(self):
         """Get the group of the Dataset.
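Taken together, the construct()/subset()/get_data() changes above behave roughly like this minimal sketch (not part of the commit; the shapes and the full/sub names are illustrative):

    import numpy as np
    import lightgbm as lgb

    X = np.random.rand(100, 10)
    y = np.random.randint(2, size=100)

    full = lgb.Dataset(X, y, free_raw_data=False)
    sub = full.subset(list(range(30)))  # the subset now inherits free_raw_data=False from its parent
    sub.construct()                     # copies full.data, then get_data() slices it once

    print(sub.get_data().shape)         # (30, 10): the rows of X selected by used_indices
    print(sub.get_data().shape)         # still (30, 10): need_slice is False, the cached slice is reused

A pandas DataFrame is sliced with .iloc in the same way; other raw-data types only trigger the warning and are returned unchanged.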
20 changes: 20 additions & 0 deletions tests/python_package_test/test_engine.py
@@ -808,3 +808,23 @@ def test_constant_features_multiclassova(self):
         }
         self.test_constant_features([0.0, 1.0, 2.0, 0.0], [0.5, 0.25, 0.25], params)
         self.test_constant_features([0.0, 1.0, 2.0, 1.0], [0.25, 0.5, 0.25], params)
+
+    def test_fpreproc(self):
+        def preprocess_data(dtrain, dtest, params):
+            train_data = dtrain.construct().get_data()
+            test_data = dtest.construct().get_data()
+            train_data[:, 0] += 1
+            test_data[:, 0] += 1
+            dtrain.label[-5:] = 3
+            dtest.label[-5:] = 3
+            dtrain = lgb.Dataset(train_data, dtrain.label)
+            dtest = lgb.Dataset(test_data, dtest.label, reference=dtrain)
+            params['num_class'] = 4
+            return dtrain, dtest, params
+
+        X, y = load_iris(True)
+        dataset = lgb.Dataset(X, y, free_raw_data=False)
+        params = {'objective': 'multiclass', 'num_class': 3, 'verbose': -1}
+        results = lgb.cv(params, dataset, num_boost_round=10, fpreproc=preprocess_data)
+        self.assertIn('multi_logloss-mean', results)
+        self.assertEqual(len(results['multi_logloss-mean']), 10)
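
The "added warning for other data types" bullet in the commit message refers to the else branch of get_data(); a rough sketch of that path (not part of the commit; the chunked-list input is illustrative):

    import numpy as np
    import lightgbm as lgb

    chunks = [np.random.rand(50, 10), np.random.rand(50, 10)]  # list of numpy arrays
    y = np.random.randint(2, size=100)

    full = lgb.Dataset(chunks, y, free_raw_data=False)
    sub = full.subset(list(range(30))).construct()
    data = sub.get_data()  # warns "Cannot subset list type of raw data" and returns the original list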
