
Commit

[python] add sparsity support for new version of pandas and check Series for bad dtypes (#2318)

* reworked pandas dtypes mapper

* added tests

* added sparsity support for new version of pandas

* fixed tests for old pandas

* check pd.Series for bad dtypes as well

* enhanced tests

* fixed pylint
StrikerRUS committed Aug 13, 2019
1 parent 9cf6b82 commit 8f446be
Showing 4 changed files with 72 additions and 14 deletions.
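
At a glance, the change accepts sparse pandas inputs and rejects pandas Series with unsupported dtypes up front. A rough sketch of the behavior this commit targets (pandas build with pd.SparseArray and this LightGBM build assumed; the data below is illustrative, not taken from the diff):

import numpy as np
import pandas as pd
import lightgbm as lgb

# Sparse pandas columns and labels are handled like their dense equivalents.
X = pd.DataFrame({"a": pd.SparseArray(np.random.permutation([0, 1, 2] * 100)),
                  "b": pd.SparseArray(np.random.permutation([0.0, 0.5] * 150))})
y = pd.Series(pd.SparseArray(np.random.permutation([0, 1] * 150)))
booster = lgb.train({"objective": "binary", "verbose": -1}, lgb.Dataset(X, y), num_boost_round=5)

# A label Series with a non-numeric dtype now fails fast with a clear message.
try:
    lgb.Dataset(X, label=pd.Series(["a", "b"] * 150)).construct()
except ValueError as err:
    print(err)  # Series.dtypes must be int, float or bool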
36 changes: 22 additions & 14 deletions python-package/lightgbm/basic.py
@@ -13,7 +13,7 @@
 import numpy as np
 import scipy.sparse

-from .compat import (PANDAS_INSTALLED, DataFrame, Series,
+from .compat import (PANDAS_INSTALLED, DataFrame, Series, is_dtype_sparse,
                      DataTable,
                      decode_string, string_type,
                      integer_types, numeric_types,
@@ -78,7 +78,12 @@ def list_to_1d_numpy(data, dtype=np.float32, name='list'):
     elif is_1d_list(data):
         return np.array(data, dtype=dtype, copy=False)
     elif isinstance(data, Series):
-        return data.values.astype(dtype)
+        if _get_bad_pandas_dtypes([data.dtypes]):
+            raise ValueError('Series.dtypes must be int, float or bool')
+        if hasattr(data.values, 'values'):  # SparseArray
+            return data.values.values.astype(dtype)
+        else:
+            return data.values.astype(dtype)
     else:
         raise TypeError("Wrong type({0}) for {1}.\n"
                         "It should be list, numpy 1-D array or pandas Series".format(type(data).__name__, name))
@@ -194,11 +199,6 @@ class LightGBMError(Exception):
                      "feature_penalty": C_API_DTYPE_FLOAT64,
                      "monotone_constraints": C_API_DTYPE_INT8}

-PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int',
-                       'int64': 'int', 'uint8': 'int', 'uint16': 'int',
-                       'uint32': 'int', 'uint64': 'int', 'bool': 'int',
-                       'float16': 'float', 'float32': 'float', 'float64': 'float'}
-

 def convert_from_sliced_object(data):
     """Fix the memory of multi-dimensional sliced object."""
@@ -252,6 +252,17 @@ def c_int_array(data):
     return (ptr_data, type_data, data) # return `data` to avoid the temporary copy is freed


+def _get_bad_pandas_dtypes(dtypes):
+    pandas_dtype_mapper = {'int8': 'int', 'int16': 'int', 'int32': 'int',
+                           'int64': 'int', 'uint8': 'int', 'uint16': 'int',
+                           'uint32': 'int', 'uint64': 'int', 'bool': 'int',
+                           'float16': 'float', 'float32': 'float', 'float64': 'float'}
+    bad_indices = [i for i, dtype in enumerate(dtypes) if (dtype.name not in pandas_dtype_mapper
+                                                           and (not is_dtype_sparse(dtype)
+                                                                or dtype.subtype.name not in pandas_dtype_mapper))]
+    return bad_indices
+
+
 def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorical):
     if isinstance(data, DataFrame):
         if len(data.shape) != 2 or data.shape[0] < 1:
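
_get_bad_pandas_dtypes folds the old PANDAS_DTYPE_MAPPER into a helper and lets a sparse dtype through whenever its subtype is a supported numeric type. A self-contained approximation (calling pandas.api.types.is_sparse directly instead of the compat alias; pd.SparseArray from 0.24-era pandas assumed):

import pandas as pd
from pandas.api.types import is_sparse

_ok = {'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64',
       'bool', 'float16', 'float32', 'float64'}

def bad_dtype_indices(dtypes):
    # A dtype is acceptable if it is plainly numeric/bool, or sparse with a
    # numeric/bool subtype; everything else is reported by position.
    return [i for i, dtype in enumerate(dtypes)
            if dtype.name not in _ok
            and (not is_sparse(dtype) or dtype.subtype.name not in _ok)]

df = pd.DataFrame({'dense_int': [1, 2, 3],
                   'sparse_float': pd.SparseArray([0.0, 1.5, 0.0]),
                   'text': ['a', 'b', 'c']})
bad = bad_dtype_indices(df.dtypes)
print(bad, list(df.columns[bad]))  # [2] ['text'] -- only the object column is flagged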
@@ -280,13 +291,11 @@ def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorical):
                 categorical_feature = list(categorical_feature)
         if feature_name == 'auto':
             feature_name = list(data.columns)
-        data_dtypes = data.dtypes
-        if not all(dtype.name in PANDAS_DTYPE_MAPPER for dtype in data_dtypes):
-            bad_fields = [data.columns[i] for i, dtype in
-                          enumerate(data_dtypes) if dtype.name not in PANDAS_DTYPE_MAPPER]
+        bad_indices = _get_bad_pandas_dtypes(data.dtypes)
+        if bad_indices:
             raise ValueError("DataFrame.dtypes for data must be int, float or bool.\n"
                              "Did not expect the data types in the following fields: "
-                             + ', '.join(bad_fields))
+                             + ', '.join(data.columns[bad_indices]))
         data = data.values.astype('float')
     else:
         if feature_name == 'auto':
@@ -300,8 +309,7 @@ def _label_from_pandas(label):
     if isinstance(label, DataFrame):
         if len(label.columns) > 1:
             raise ValueError('DataFrame for label cannot have multiple columns')
-        label_dtypes = label.dtypes
-        if not all(dtype.name in PANDAS_DTYPE_MAPPER for dtype in label_dtypes):
+        if _get_bad_pandas_dtypes(label.dtypes):
             raise ValueError('DataFrame.dtypes for label must be int, float or bool')
         label = label.values.astype('float').flatten()
     return label
3 changes: 3 additions & 0 deletions python-package/lightgbm/compat.py
@@ -62,6 +62,7 @@ def json_default_with_numpy(obj):
 """pandas"""
 try:
     from pandas import Series, DataFrame
+    from pandas.api.types import is_sparse as is_dtype_sparse
     PANDAS_INSTALLED = True
 except ImportError:
     PANDAS_INSTALLED = False
@@ -76,6 +77,8 @@ class DataFrame(object):

         pass

+    is_dtype_sparse = None
+
 """matplotlib"""
 try:
     import matplotlib
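
compat.py keeps its existing pattern: import the real pandas helper under a stable alias inside the try block and fall back to a None placeholder, so basic.py can import is_dtype_sparse unconditionally. A quick check of what the aliased function reports (pandas installed assumed):

import pandas as pd
from pandas.api.types import is_sparse  # re-exported as is_dtype_sparse in compat.py

dense = pd.Series([1.0, 2.0])
sparse = pd.Series(pd.SparseArray([1.0, 0.0, 2.0]))
print(is_sparse(dense.dtype), is_sparse(sparse.dtype))  # False True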
26 changes: 26 additions & 0 deletions tests/python_package_test/test_engine.py
@@ -719,6 +719,32 @@ def test_pandas_categorical(self):
         self.assertListEqual(gbm6.pandas_categorical, cat_values)
         self.assertListEqual(gbm7.pandas_categorical, cat_values)

+    @unittest.skipIf(not lgb.compat.PANDAS_INSTALLED, 'pandas is not installed')
+    def test_pandas_sparse(self):
+        import pandas as pd
+        X = pd.DataFrame({"A": pd.SparseArray(np.random.permutation([0, 1, 2] * 100)),
+                          "B": pd.SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1, 0.2] * 60)),
+                          "C": pd.SparseArray(np.random.permutation([True, False] * 150))})
+        y = pd.Series(pd.SparseArray(np.random.permutation([0, 1] * 150)))
+        X_test = pd.DataFrame({"A": pd.SparseArray(np.random.permutation([0, 2] * 30)),
+                               "B": pd.SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1] * 15)),
+                               "C": pd.SparseArray(np.random.permutation([True, False] * 30))})
+        if pd.__version__ >= '0.24.0':
+            for dtype in pd.concat([X.dtypes, X_test.dtypes, pd.Series(y.dtypes)]):
+                self.assertTrue(pd.api.types.is_sparse(dtype))
+        params = {
+            'objective': 'binary',
+            'verbose': -1
+        }
+        lgb_train = lgb.Dataset(X, y)
+        gbm = lgb.train(params, lgb_train, num_boost_round=10)
+        pred_sparse = gbm.predict(X_test, raw_score=True)
+        if hasattr(X_test, 'sparse'):
+            pred_dense = gbm.predict(X_test.sparse.to_dense(), raw_score=True)
+        else:
+            pred_dense = gbm.predict(X_test.to_dense(), raw_score=True)
+        np.testing.assert_allclose(pred_sparse, pred_dense)
+
     def test_reference_chain(self):
         X = np.random.normal(size=(100, 2))
         y = np.random.normal(size=100)
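
The test densifies X_test through whichever API the installed pandas exposes before comparing predictions. A hedged standalone version of that branch (helper name is illustrative):

def to_dense_frame(df):
    # DataFrame.sparse is the accessor newer pandas provides for all-sparse
    # frames; older pandas falls back to the deprecated DataFrame.to_dense().
    if hasattr(df, 'sparse'):
        return df.sparse.to_dense()
    return df.to_dense()

Either branch yields an ordinary dense DataFrame, which the test then feeds back into predict so the sparse and dense raw scores can be compared with assert_allclose.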
21 changes: 21 additions & 0 deletions tests/python_package_test/test_sklearn.py
@@ -277,6 +277,27 @@ def test_pandas_categorical(self):
         self.assertListEqual(gbm5.booster_.pandas_categorical, cat_values)
         self.assertListEqual(gbm6.booster_.pandas_categorical, cat_values)

+    @unittest.skipIf(not lgb.compat.PANDAS_INSTALLED, 'pandas is not installed')
+    def test_pandas_sparse(self):
+        import pandas as pd
+        X = pd.DataFrame({"A": pd.SparseArray(np.random.permutation([0, 1, 2] * 100)),
+                          "B": pd.SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1, 0.2] * 60)),
+                          "C": pd.SparseArray(np.random.permutation([True, False] * 150))})
+        y = pd.Series(pd.SparseArray(np.random.permutation([0, 1] * 150)))
+        X_test = pd.DataFrame({"A": pd.SparseArray(np.random.permutation([0, 2] * 30)),
+                               "B": pd.SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1] * 15)),
+                               "C": pd.SparseArray(np.random.permutation([True, False] * 30))})
+        if pd.__version__ >= '0.24.0':
+            for dtype in pd.concat([X.dtypes, X_test.dtypes, pd.Series(y.dtypes)]):
+                self.assertTrue(pd.api.types.is_sparse(dtype))
+        gbm = lgb.sklearn.LGBMClassifier().fit(X, y)
+        pred_sparse = gbm.predict(X_test, raw_score=True)
+        if hasattr(X_test, 'sparse'):
+            pred_dense = gbm.predict(X_test.sparse.to_dense(), raw_score=True)
+        else:
+            pred_dense = gbm.predict(X_test.to_dense(), raw_score=True)
+        np.testing.assert_allclose(pred_sparse, pred_dense)
+
     def test_predict(self):
         iris = load_iris()
         X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target,
