Skip to content

Commit

Permalink
[python] ignore pandas ordered categorical columns by default (#2115)
Browse files (browse the repository at this point in the history)
* ignore pandas ordered categorical columns by default

* fix tests

* fix tests

* added comments
  • Loading branch information
StrikerRUS committed Apr 19, 2019
1 parent 89f2021, commit d115769
Show file tree
Hide file tree
Showing 5 changed files with 69 additions and 47 deletions.
18 changes: 9 additions & 9 deletions python-package/lightgbm/basic.py
Expand Up @@ -258,7 +258,8 @@ def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorica
raise ValueError('Input data must be 2 dimensional and non empty.')
if feature_name == 'auto' or feature_name is None:
data = data.rename(columns=str)
cat_cols = data.select_dtypes(include=['category']).columns
cat_cols = list(data.select_dtypes(include=['category']).columns)
cat_cols_not_ordered = [col for col in cat_cols if not data[col].cat.ordered]
if pandas_categorical is None: # train dataset
pandas_categorical = [list(data[col].cat.categories) for col in cat_cols]
else:
Expand All @@ -267,26 +268,25 @@ def _data_from_pandas(data, feature_name, categorical_feature, pandas_categorica
for col, category in zip_(cat_cols, pandas_categorical):
if list(data[col].cat.categories) != list(category):
data[col] = data[col].cat.set_categories(category)
if len(cat_cols): # cat_cols is pandas Index object
if len(cat_cols): # cat_cols is list
data = data.copy() # not alter origin DataFrame
data[cat_cols] = data[cat_cols].apply(lambda x: x.cat.codes).replace({-1: np.nan})
if categorical_feature is not None:
if feature_name is None:
feature_name = list(data.columns)
if categorical_feature == 'auto':
categorical_feature = list(cat_cols)
categorical_feature = cat_cols_not_ordered
else:
categorical_feature = list(categorical_feature) + list(cat_cols)
categorical_feature = list(categorical_feature) + cat_cols_not_ordered
if feature_name == 'auto':
feature_name = list(data.columns)
data_dtypes = data.dtypes
if not all(dtype.name in PANDAS_DTYPE_MAPPER for dtype in data_dtypes):
bad_fields = [data.columns[i] for i, dtype in
enumerate(data_dtypes) if dtype.name not in PANDAS_DTYPE_MAPPER]

msg = ("DataFrame.dtypes for data must be int, float or bool.\n"
"Did not expect the data types in fields ")
raise ValueError(msg + ', '.join(bad_fields))
raise ValueError("DataFrame.dtypes for data must be int, float or bool.\n"
"Did not expect the data types in fields "
+ ', '.join(bad_fields))
data = data.values.astype('float')
else:
if feature_name == 'auto':
Expand Down Expand Up @@ -686,7 +686,7 @@ def __init__(self, data, label=None, reference=None,
Categorical features.
If list of int, interpreted as indices.
If list of strings, interpreted as feature names (need to specify ``feature_name`` as well).
If 'auto' and data is pandas DataFrame, pandas categorical columns are used.
If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used.
All values in categorical features should be less than int32 max value (2147483647).
Large values could be memory consuming. Consider using consecutive integers starting from zero.
All negative values in categorical features will be treated as missing values.
Expand Down
4 changes: 2 additions & 2 deletions python-package/lightgbm/engine.py
Expand Up @@ -56,7 +56,7 @@ def train(params, train_set, num_boost_round=100,
Categorical features.
If list of int, interpreted as indices.
If list of strings, interpreted as feature names (need to specify ``feature_name`` as well).
If 'auto' and data is pandas DataFrame, pandas categorical columns are used.
If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used.
All values in categorical features should be less than int32 max value (2147483647).
Large values could be memory consuming. Consider using consecutive integers starting from zero.
All negative values in categorical features will be treated as missing values.
Expand Down Expand Up @@ -391,7 +391,7 @@ def cv(params, train_set, num_boost_round=100,
Categorical features.
If list of int, interpreted as indices.
If list of strings, interpreted as feature names (need to specify ``feature_name`` as well).
If 'auto' and data is pandas DataFrame, pandas categorical columns are used.
If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used.
All values in categorical features should be less than int32 max value (2147483647).
Large values could be memory consuming. Consider using consecutive integers starting from zero.
All negative values in categorical features will be treated as missing values.
Expand Down
2 changes: 1 addition & 1 deletion python-package/lightgbm/sklearn.py
Expand Up @@ -392,7 +392,7 @@ def fit(self, X, y,
Categorical features.
If list of int, interpreted as indices.
If list of strings, interpreted as feature names (need to specify ``feature_name`` as well).
If 'auto' and data is pandas DataFrame, pandas categorical columns are used.
If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used.
All values in categorical features should be less than int32 max value (2147483647).
Large values could be memory consuming. Consider using consecutive integers starting from zero.
All negative values in categorical features will be treated as missing values.
Expand Down
52 changes: 31 additions & 21 deletions tests/python_package_test/test_engine.py
Expand Up @@ -553,39 +553,42 @@ def test_template(init_model=None, return_model=False):
@unittest.skipIf(not lgb.compat.PANDAS_INSTALLED, 'pandas is not installed')
def test_pandas_categorical(self):
import pandas as pd
np.random.seed(42) # sometimes there is no difference how E col is treated (cat or not cat)
X = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'c', 'd'] * 75), # str
"B": np.random.permutation([1, 2, 3] * 100), # int
"C": np.random.permutation([0.1, 0.2, -0.1, -0.1, 0.2] * 60), # float
"D": np.random.permutation([True, False] * 150)}) # bool
"D": np.random.permutation([True, False] * 150), # bool
"E": pd.Categorical(np.random.permutation(['z', 'y', 'x', 'w', 'v'] * 60),
ordered=True)}) # str and ordered categorical
y = np.random.permutation([0, 1] * 150)
X_test = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'e'] * 20),
X_test = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'e'] * 20), # unseen category
"B": np.random.permutation([1, 3] * 30),
"C": np.random.permutation([0.1, -0.1, 0.2, 0.2] * 15),
"D": np.random.permutation([True, False] * 30)})
cat_cols = []
for col in ["A", "B", "C", "D"]:
X[col] = X[col].astype('category')
X_test[col] = X_test[col].astype('category')
cat_cols.append(X[col].cat.categories.tolist())
"D": np.random.permutation([True, False] * 30),
"E": pd.Categorical(pd.np.random.permutation(['z', 'y'] * 30),
ordered=True)})
np.random.seed() # reset seed
cat_cols_actual = ["A", "B", "C", "D"]
cat_cols_to_store = cat_cols_actual + ["E"]
X[cat_cols_actual] = X[cat_cols_actual].astype('category')
X_test[cat_cols_actual] = X_test[cat_cols_actual].astype('category')
cat_values = [X[col].cat.categories.tolist() for col in cat_cols_to_store]
params = {
'objective': 'binary',
'metric': 'binary_logloss',
'verbose': -1
}
lgb_train = lgb.Dataset(X, y)
gbm0 = lgb.train(params, lgb_train, num_boost_round=10, verbose_eval=False)
gbm0 = lgb.train(params, lgb_train, num_boost_round=10)
pred0 = gbm0.predict(X_test)
lgb_train = lgb.Dataset(X, pd.DataFrame(y)) # also test that label can be one-column pd.DataFrame
gbm1 = lgb.train(params, lgb_train, num_boost_round=10, verbose_eval=False,
categorical_feature=[0])
gbm1 = lgb.train(params, lgb_train, num_boost_round=10, categorical_feature=[0])
pred1 = gbm1.predict(X_test)
lgb_train = lgb.Dataset(X, pd.Series(y)) # also test that label can be pd.Series
gbm2 = lgb.train(params, lgb_train, num_boost_round=10, verbose_eval=False,
categorical_feature=['A'])
gbm2 = lgb.train(params, lgb_train, num_boost_round=10, categorical_feature=['A'])
pred2 = gbm2.predict(X_test)
lgb_train = lgb.Dataset(X, y)
gbm3 = lgb.train(params, lgb_train, num_boost_round=10, verbose_eval=False,
categorical_feature=['A', 'B', 'C', 'D'])
gbm3 = lgb.train(params, lgb_train, num_boost_round=10, categorical_feature=['A', 'B', 'C', 'D'])
pred3 = gbm3.predict(X_test)
gbm3.save_model('categorical.model')
gbm4 = lgb.Booster(model_file='categorical.model')
Expand All @@ -595,18 +598,25 @@ def test_pandas_categorical(self):
pred5 = gbm4.predict(X_test)
gbm5 = lgb.Booster(model_str=model_str)
pred6 = gbm5.predict(X_test)
lgb_train = lgb.Dataset(X, y)
gbm6 = lgb.train(params, lgb_train, num_boost_round=10, categorical_feature=['E'])
pred7 = gbm6.predict(X_test)
np.testing.assert_almost_equal(pred0, pred1)
np.testing.assert_almost_equal(pred0, pred2)
np.testing.assert_almost_equal(pred0, pred3)
np.testing.assert_almost_equal(pred0, pred4)
np.testing.assert_almost_equal(pred0, pred5)
np.testing.assert_almost_equal(pred0, pred6)
self.assertListEqual(gbm0.pandas_categorical, cat_cols)
self.assertListEqual(gbm1.pandas_categorical, cat_cols)
self.assertListEqual(gbm2.pandas_categorical, cat_cols)
self.assertListEqual(gbm3.pandas_categorical, cat_cols)
self.assertListEqual(gbm4.pandas_categorical, cat_cols)
self.assertListEqual(gbm5.pandas_categorical, cat_cols)
self.assertRaises(AssertionError,
np.testing.assert_almost_equal,
pred0, pred7) # ordered cat features aren't treated as cat features by default
self.assertListEqual(gbm0.pandas_categorical, cat_values)
self.assertListEqual(gbm1.pandas_categorical, cat_values)
self.assertListEqual(gbm2.pandas_categorical, cat_values)
self.assertListEqual(gbm3.pandas_categorical, cat_values)
self.assertListEqual(gbm4.pandas_categorical, cat_values)
self.assertListEqual(gbm5.pandas_categorical, cat_values)
self.assertListEqual(gbm6.pandas_categorical, cat_values)

def test_reference_chain(self):
X = np.random.normal(size=(100, 2))
Expand Down
40 changes: 26 additions & 14 deletions tests/python_package_test/test_sklearn.py
Expand Up @@ -206,22 +206,29 @@ def test_sklearn_integration(self):
@unittest.skipIf(not lgb.compat.PANDAS_INSTALLED, 'pandas is not installed')
def test_pandas_categorical(self):
import pandas as pd
np.random.seed(42) # sometimes there is no difference how E col is treated (cat or not cat)
X = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'c', 'd'] * 75), # str
"B": np.random.permutation([1, 2, 3] * 100), # int
"C": np.random.permutation([0.1, 0.2, -0.1, -0.1, 0.2] * 60), # float
"D": np.random.permutation([True, False] * 150)}) # bool
"D": np.random.permutation([True, False] * 150), # bool
"E": pd.Categorical(np.random.permutation(['z', 'y', 'x', 'w', 'v'] * 60),
ordered=True)}) # str and ordered categorical
y = np.random.permutation([0, 1] * 150)
X_test = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'e'] * 20),
X_test = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'e'] * 20), # unseen category
"B": np.random.permutation([1, 3] * 30),
"C": np.random.permutation([0.1, -0.1, 0.2, 0.2] * 15),
"D": np.random.permutation([True, False] * 30)})
cat_cols = []
for col in ["A", "B", "C", "D"]:
X[col] = X[col].astype('category')
X_test[col] = X_test[col].astype('category')
cat_cols.append(X[col].cat.categories.tolist())
"D": np.random.permutation([True, False] * 30),
"E": pd.Categorical(pd.np.random.permutation(['z', 'y'] * 30),
ordered=True)})
np.random.seed() # reset seed
cat_cols_actual = ["A", "B", "C", "D"]
cat_cols_to_store = cat_cols_actual + ["E"]
X[cat_cols_actual] = X[cat_cols_actual].astype('category')
X_test[cat_cols_actual] = X_test[cat_cols_actual].astype('category')
cat_values = [X[col].cat.categories.tolist() for col in cat_cols_to_store]
gbm0 = lgb.sklearn.LGBMClassifier().fit(X, y)
pred0 = gbm0.predict(X_test)
pred_prob = gbm0.predict_proba(X_test)[:, 1]
gbm1 = lgb.sklearn.LGBMClassifier().fit(X, pd.Series(y), categorical_feature=[0])
pred1 = gbm1.predict(X_test)
gbm2 = lgb.sklearn.LGBMClassifier().fit(X, y, categorical_feature=['A'])
Expand All @@ -231,16 +238,21 @@ def test_pandas_categorical(self):
gbm3.booster_.save_model('categorical.model')
gbm4 = lgb.Booster(model_file='categorical.model')
pred4 = gbm4.predict(X_test)
pred_prob = gbm0.predict_proba(X_test)[:, 1]
gbm5 = lgb.sklearn.LGBMClassifier().fit(X, y, categorical_feature=['E'])
pred5 = gbm5.predict(X_test)
np.testing.assert_almost_equal(pred0, pred1)
np.testing.assert_almost_equal(pred0, pred2)
np.testing.assert_almost_equal(pred0, pred3)
np.testing.assert_almost_equal(pred_prob, pred4)
self.assertListEqual(gbm0.booster_.pandas_categorical, cat_cols)
self.assertListEqual(gbm1.booster_.pandas_categorical, cat_cols)
self.assertListEqual(gbm2.booster_.pandas_categorical, cat_cols)
self.assertListEqual(gbm3.booster_.pandas_categorical, cat_cols)
self.assertListEqual(gbm4.pandas_categorical, cat_cols)
self.assertRaises(AssertionError,
np.testing.assert_almost_equal,
pred0, pred5) # ordered cat features aren't treated as cat features by default
self.assertListEqual(gbm0.booster_.pandas_categorical, cat_values)
self.assertListEqual(gbm1.booster_.pandas_categorical, cat_values)
self.assertListEqual(gbm2.booster_.pandas_categorical, cat_values)
self.assertListEqual(gbm3.booster_.pandas_categorical, cat_values)
self.assertListEqual(gbm4.pandas_categorical, cat_values)
self.assertListEqual(gbm5.booster_.pandas_categorical, cat_values)

def test_predict(self):
iris = load_iris()
Expand Down

0 comments on commit d115769

Please sign in to comment.