Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[python-package] simplify processing of pandas data #6066

Merged
merged 9 commits into from
Sep 6, 2023
118 changes: 62 additions & 56 deletions python-package/lightgbm/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -668,57 +668,52 @@ def _check_for_bad_pandas_dtypes(pandas_dtypes_series: pd_Series) -> None:


def _data_from_pandas(
data,
feature_name: Optional[_LGBM_FeatureNameConfiguration],
categorical_feature: Optional[_LGBM_CategoricalFeatureConfiguration],
data: pd_DataFrame,
feature_name: _LGBM_FeatureNameConfiguration,
categorical_feature: _LGBM_CategoricalFeatureConfiguration,
pandas_categorical: Optional[List[List]]
):
if isinstance(data, pd_DataFrame):
if len(data.shape) != 2 or data.shape[0] < 1:
raise ValueError('Input data must be 2 dimensional and non empty.')
if feature_name == 'auto' or feature_name is None:
data = data.rename(columns=str, copy=False)
cat_cols = [col for col, dtype in zip(data.columns, data.dtypes) if isinstance(dtype, pd_CategoricalDtype)]
cat_cols_not_ordered = [col for col in cat_cols if not data[col].cat.ordered]
if pandas_categorical is None: # train dataset
pandas_categorical = [list(data[col].cat.categories) for col in cat_cols]
else:
if len(cat_cols) != len(pandas_categorical):
raise ValueError('train and valid dataset categorical_feature do not match.')
for col, category in zip(cat_cols, pandas_categorical):
if list(data[col].cat.categories) != list(category):
data[col] = data[col].cat.set_categories(category)
if len(cat_cols): # cat_cols is list
data = data.copy(deep=False) # not alter origin DataFrame
data[cat_cols] = data[cat_cols].apply(lambda x: x.cat.codes).replace({-1: np.nan})
if categorical_feature is not None:
if feature_name is None:
feature_name = list(data.columns)
if categorical_feature == 'auto': # use cat cols from DataFrame
categorical_feature = cat_cols_not_ordered
else: # use cat cols specified by user
categorical_feature = list(categorical_feature) # type: ignore[assignment]
if feature_name == 'auto':
feature_name = list(data.columns)
_check_for_bad_pandas_dtypes(data.dtypes)
df_dtypes = [dtype.type for dtype in data.dtypes]
df_dtypes.append(np.float32) # so that the target dtype considers floats
target_dtype = np.result_type(*df_dtypes)
try:
# most common case (no nullable dtypes)
data = data.to_numpy(dtype=target_dtype, copy=False)
except TypeError:
# 1.0 <= pd version < 1.1 and nullable dtypes, least common case
# raises error because array is casted to type(pd.NA) and there's no na_value argument
data = data.astype(target_dtype, copy=False).values
except ValueError:
# data has nullable dtypes, but we can specify na_value argument and copy will be made
data = data.to_numpy(dtype=target_dtype, na_value=np.nan)
) -> Tuple[np.ndarray, List[str], List[str], List[List]]:
if len(data.shape) != 2 or data.shape[0] < 1:
raise ValueError('Input data must be 2 dimensional and non empty.')

# determine feature names
if feature_name == 'auto':
feature_name = [str(col) for col in data.columns]

# determine categorical features
cat_cols = [col for col, dtype in zip(data.columns, data.dtypes) if isinstance(dtype, pd_CategoricalDtype)]
cat_cols_not_ordered = [col for col in cat_cols if not data[col].cat.ordered]
if pandas_categorical is None: # train dataset
pandas_categorical = [list(data[col].cat.categories) for col in cat_cols]
else:
if feature_name == 'auto':
feature_name = None
if categorical_feature == 'auto':
categorical_feature = None
if len(cat_cols) != len(pandas_categorical):
raise ValueError('train and valid dataset categorical_feature do not match.')
for col, category in zip(cat_cols, pandas_categorical):
if list(data[col].cat.categories) != list(category):
data[col] = data[col].cat.set_categories(category)
if len(cat_cols): # cat_cols is list
data = data.copy(deep=False) # not alter origin DataFrame
data[cat_cols] = data[cat_cols].apply(lambda x: x.cat.codes).replace({-1: np.nan})
if categorical_feature == 'auto': # use cat cols from DataFrame
categorical_feature = cat_cols_not_ordered
else: # use cat cols specified by user
categorical_feature = list(categorical_feature) # type: ignore[assignment]

# get numpy representation of the data
_check_for_bad_pandas_dtypes(data.dtypes)
df_dtypes = [dtype.type for dtype in data.dtypes]
df_dtypes.append(np.float32) # so that the target dtype considers floats
target_dtype = np.result_type(*df_dtypes)
try:
# most common case (no nullable dtypes)
data = data.to_numpy(dtype=target_dtype, copy=False)
except TypeError:
# 1.0 <= pd version < 1.1 and nullable dtypes, least common case
# raises error because array is casted to type(pd.NA) and there's no na_value argument
data = data.astype(target_dtype, copy=False).values
except ValueError:
# data has nullable dtypes, but we can specify na_value argument and copy will be made
data = data.to_numpy(dtype=target_dtype, na_value=np.nan)
return data, feature_name, categorical_feature, pandas_categorical


Expand Down Expand Up @@ -1004,7 +999,15 @@ def predict(
ctypes.c_int(len(data_names)),
)
)
data = _data_from_pandas(data, None, None, self.pandas_categorical)[0]

if isinstance(data, pd_DataFrame):
data = _data_from_pandas(
data=data,
feature_name="auto",
categorical_feature="auto",
pandas_categorical=self.pandas_categorical
)[0]

predict_type = _C_API_PREDICT_NORMAL
if raw_score:
predict_type = _C_API_PREDICT_RAW_SCORE
Expand Down Expand Up @@ -1854,10 +1857,13 @@ def _lazy_init(
if reference is not None:
self.pandas_categorical = reference.pandas_categorical
categorical_feature = reference.categorical_feature
data, feature_name, categorical_feature, self.pandas_categorical = _data_from_pandas(data=data,
feature_name=feature_name,
categorical_feature=categorical_feature,
pandas_categorical=self.pandas_categorical)
if isinstance(data, pd_DataFrame):
data, feature_name, categorical_feature, self.pandas_categorical = _data_from_pandas(
data=data,
feature_name=feature_name,
categorical_feature=categorical_feature,
pandas_categorical=self.pandas_categorical
)

# process for args
params = {} if params is None else params
Expand All @@ -1867,10 +1873,10 @@ def _lazy_init(
_log_warning(f'{key} keyword has been found in `params` and will be ignored.\n'
f'Please use {key} argument of the Dataset constructor to pass this parameter.')
# get categorical features
if categorical_feature is not None:
if isinstance(categorical_feature, list):
categorical_indices = set()
feature_dict = {}
if feature_name is not None:
if isinstance(feature_name, list):
feature_dict = {name: i for i, name in enumerate(feature_name)}
for name in categorical_feature:
if isinstance(name, str) and name in feature_dict:
Expand Down
4 changes: 2 additions & 2 deletions python-package/lightgbm/plotting.py
Original file line number Diff line number Diff line change
Expand Up @@ -712,8 +712,8 @@ def create_tree_digraph(
if isinstance(example_case, pd_DataFrame):
example_case = _data_from_pandas(
data=example_case,
feature_name=None,
categorical_feature=None,
feature_name="auto",
categorical_feature="auto",
pandas_categorical=booster.pandas_categorical
)[0]
example_case = example_case[0]
Expand Down
21 changes: 19 additions & 2 deletions tests/python_package_test/test_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -723,7 +723,12 @@ def test_no_copy_when_single_float_dtype_dataframe(dtype, feature_name):
pd = pytest.importorskip('pandas')
X = np.random.rand(10, 2).astype(dtype)
df = pd.DataFrame(X)
built_data = lgb.basic._data_from_pandas(df, feature_name, None, None)[0]
built_data = lgb.basic._data_from_pandas(
data=df,
feature_name=feature_name,
categorical_feature="auto",
pandas_categorical=None
)[0]
assert built_data.dtype == dtype
assert np.shares_memory(X, built_data)

Expand All @@ -734,7 +739,12 @@ def test_categorical_code_conversion_doesnt_modify_original_data(feature_name):
X = np.random.choice(['a', 'b'], 100).reshape(-1, 1)
column_name = 'a' if feature_name == 'auto' else feature_name[0]
df = pd.DataFrame(X.copy(), columns=[column_name], dtype='category')
data = lgb.basic._data_from_pandas(df, feature_name, None, None)[0]
data = lgb.basic._data_from_pandas(
data=df,
feature_name=feature_name,
categorical_feature="auto",
pandas_categorical=None
)[0]
# check that the original data wasn't modified
np.testing.assert_equal(df[column_name], X[:, 0])
# check that the built data has the codes
Expand Down Expand Up @@ -806,3 +816,10 @@ def test_set_leaf_output():
leaf_output = bst.get_leaf_output(tree_id=0, leaf_id=leaf_id)
bst.set_leaf_output(tree_id=0, leaf_id=leaf_id, value=leaf_output + 1)
np.testing.assert_allclose(bst.predict(X), y_pred + 1)


def test_feature_names_are_set_correctly_when_no_feature_names_passed_into_Dataset():
ds = lgb.Dataset(
data=np.random.randn(100, 3),
)
assert ds.construct().feature_name == ["Column_0", "Column_1", "Column_2"]