Skip to content

Commit

Permalink
[python] avoid data copy where possible (#2383)
Browse files Browse the repository at this point in the history
* avoid copy where possible

* use precise type for importance type

* remove pointless code

* simplify sparse pandas Series conversion

* more memory savings

* always force type conversion for 1-D arrays

* one more copy=False
  • Loading branch information
StrikerRUS committed Sep 26, 2019
1 parent 7a8c4e5 commit d064019
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 17 deletions.
24 changes: 10 additions & 14 deletions python-package/lightgbm/basic.py
Expand Up @@ -80,10 +80,7 @@ def list_to_1d_numpy(data, dtype=np.float32, name='list'):
elif isinstance(data, Series):
if _get_bad_pandas_dtypes([data.dtypes]):
raise ValueError('Series.dtypes must be int, float or bool')
if hasattr(data.values, 'values'): # SparseArray
return data.values.values.astype(dtype)
else:
return data.values.astype(dtype)
return np.array(data, dtype=dtype, copy=False) # SparseArray should be supported as well
else:
raise TypeError("Wrong type({0}) for {1}.\n"
"It should be list, numpy 1-D array or pandas Series".format(type(data).__name__, name))
Expand Down Expand Up @@ -296,7 +293,9 @@ def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorica
raise ValueError("DataFrame.dtypes for data must be int, float or bool.\n"
"Did not expect the data types in the following fields: "
+ ', '.join(data.columns[bad_indices]))
data = data.values.astype('float')
data = data.values
if data.dtype != np.float32 and data.dtype != np.float64:
data = data.astype(np.float32)
else:
if feature_name == 'auto':
feature_name = None
Expand All @@ -311,7 +310,7 @@ def _label_from_pandas(label):
raise ValueError('DataFrame for label cannot have multiple columns')
if _get_bad_pandas_dtypes(label.dtypes):
raise ValueError('DataFrame.dtypes for label must be int, float or bool')
label = label.values.astype('float').flatten()
label = np.ravel(label.values.astype(np.float32, copy=False))
return label


Expand Down Expand Up @@ -534,8 +533,7 @@ def __pred_for_np2d(self, mat, num_iteration, predict_type):
def inner_predict(mat, num_iteration, predict_type, preds=None):
if mat.dtype == np.float32 or mat.dtype == np.float64:
data = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
else:
"""change non-float data to float data, need to copy"""
else: # change non-float data to float data, need to copy
data = np.array(mat.reshape(mat.size), dtype=np.float32)
ptr_data, type_ptr_data, _ = c_float_array(data)
n_preds = self.__get_num_preds(num_iteration, mat.shape[0], predict_type)
Expand Down Expand Up @@ -876,8 +874,7 @@ def __init_from_np2d(self, mat, params_str, ref_dataset):
self.handle = ctypes.c_void_p()
if mat.dtype == np.float32 or mat.dtype == np.float64:
data = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
else:
# change non-float data to float data, need to copy
else: # change non-float data to float data, need to copy
data = np.array(mat.reshape(mat.size), dtype=np.float32)

ptr_data, type_ptr_data, _ = c_float_array(data)
Expand Down Expand Up @@ -915,8 +912,7 @@ def __init_from_list_np2d(self, mats, params_str, ref_dataset):

if mat.dtype == np.float32 or mat.dtype == np.float64:
mats[i] = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
else:
# change non-float data to float data, need to copy
else: # change non-float data to float data, need to copy
mats[i] = np.array(mat.reshape(mat.size), dtype=np.float32)

chunk_ptr_data, chunk_type_ptr_data, holder = c_float_array(mats[i])
Expand Down Expand Up @@ -1012,7 +1008,7 @@ def construct(self):
used_indices = list_to_1d_numpy(self.used_indices, np.int32, name='used_indices')
assert used_indices.flags.c_contiguous
if self.reference.group is not None:
group_info = np.array(self.reference.group).astype(int)
group_info = np.array(self.reference.group).astype(np.int32, copy=False)
_, self.group = np.unique(np.repeat(range_(len(group_info)), repeats=group_info)[self.used_indices],
return_counts=True)
self.handle = ctypes.c_void_p()
Expand Down Expand Up @@ -2512,7 +2508,7 @@ def feature_importance(self, importance_type='split', iteration=None):
ctypes.c_int(importance_type_int),
result.ctypes.data_as(ctypes.POINTER(ctypes.c_double))))
if importance_type_int == 0:
return result.astype(int)
return result.astype(np.int32)
else:
return result

Expand Down
6 changes: 3 additions & 3 deletions python-package/lightgbm/engine.py
Expand Up @@ -308,17 +308,17 @@ def _make_n_folds(full_data, folds, nfold, params, seed, fpreproc=None, stratifi
if hasattr(folds, 'split'):
group_info = full_data.get_group()
if group_info is not None:
group_info = np.array(group_info, dtype=int)
group_info = np.array(group_info, dtype=np.int32, copy=False)
flatted_group = np.repeat(range_(len(group_info)), repeats=group_info)
else:
flatted_group = np.zeros(num_data, dtype=int)
flatted_group = np.zeros(num_data, dtype=np.int32)
folds = folds.split(X=np.zeros(num_data), y=full_data.get_label(), groups=flatted_group)
else:
if 'objective' in params and params['objective'] == 'lambdarank':
if not SKLEARN_INSTALLED:
raise LightGBMError('Scikit-learn is required for lambdarank cv.')
# lambdarank task, split according to groups
group_info = np.array(full_data.get_group(), dtype=int)
group_info = np.array(full_data.get_group(), dtype=np.int32, copy=False)
flatted_group = np.repeat(range_(len(group_info)), repeats=group_info)
group_kfold = _LGBMGroupKFold(n_splits=nfold)
folds = group_kfold.split(X=np.zeros(num_data), groups=flatted_group)
Expand Down

0 comments on commit d064019

Please sign in to comment.