Skip to content

Commit

Permalink
BUG Fixes pandas dataframe bug with boolean dtypes (scikit-learn#15797)
Browse files Browse the repository at this point in the history
  • Loading branch information
thomasjpfan authored and ogrisel committed Dec 31, 2019
1 parent 787c12e commit 14b6223
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 2 deletions.
19 changes: 19 additions & 0 deletions doc/whats_new/v0.22.rst
Expand Up @@ -2,6 +2,25 @@

.. currentmodule:: sklearn

.. _changes_0_22_1:

Version 0.22.1
==============

**In Development**

This is a bug-fix release primarily to resolve some packaging issues in version
0.22.0. It also includes minor documentation improvements and some bug fixes.

Changelog
---------

:mod:`sklearn.utils`
....................

- |Fix| :func:`utils.check_array` now correctly converts a pandas DataFrame
  with boolean columns to floats. :pr:`15797` by `Thomas Fan`_.

.. _changes_0_22:

Version 0.22.0
Expand Down
21 changes: 21 additions & 0 deletions sklearn/utils/tests/test_validation.py
Expand Up @@ -826,6 +826,27 @@ def test_check_dataframe_warns_on_dtype():
assert len(record) == 0


def test_check_dataframe_mixed_float_dtypes():
    # A pandas dataframe will coerce a boolean into an object, which is a
    # mismatch with np.result_type, which will return a float.
    # check_array needs to explicitly check for bool dtype in a dataframe
    # for this situation.
    # https://github.com/scikit-learn/scikit-learn/issues/15787

    pd = importorskip("pandas")
    df = pd.DataFrame({
        'int': [1, 2, 3],
        'float': [0, 0.1, 2.1],
        'bool': [True, False, True]}, columns=['int', 'float', 'bool'])

    # Request conversion to one of the accepted float dtypes.
    array = check_array(df, dtype=(np.float64, np.float32, np.float16))

    # Booleans are expected to become 1.0 / 0.0 in the converted array.
    # np.float64 is spelled out because the ``np.float`` alias of the
    # builtin float is deprecated and removed in recent NumPy releases.
    expected_array = np.array(
        [[1.0, 0.0, 1.0],
         [2.0, 0.1, 0.0],
         [3.0, 2.1, 1.0]], dtype=np.float64)
    assert_allclose_dense_sparse(array, expected_array)


class DummyMemory:
def cache(self, func):
return func
Expand Down
9 changes: 7 additions & 2 deletions sklearn/utils/validation.py
Expand Up @@ -454,9 +454,14 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True,
# DataFrame), and store them. If not, store None.
dtypes_orig = None
if hasattr(array, "dtypes") and hasattr(array.dtypes, '__array__'):
dtypes_orig = np.array(array.dtypes)
dtypes_orig = list(array.dtypes)
# pandas boolean dtype __array__ interface coerces bools to objects
for i, dtype_iter in enumerate(dtypes_orig):
if dtype_iter.kind == 'b':
dtypes_orig[i] = np.object

if all(isinstance(dtype, np.dtype) for dtype in dtypes_orig):
dtype_orig = np.result_type(*array.dtypes)
dtype_orig = np.result_type(*dtypes_orig)

if dtype_numeric:
if dtype_orig is not None and dtype_orig.kind == "O":
Expand Down

0 comments on commit 14b6223

Please sign in to comment.