BUG Fixes pandas dataframe bug with boolean dtypes (scikit-learn#15797)

ogrisel · Dec 31, 2019 · 14b6223 · 14b6223
1 parent 787c12e
commit 14b6223
Show file tree

Hide file tree

Showing 3 changed files with 47 additions and 2 deletions.
diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst
@@ -2,6 +2,25 @@
 
 .. currentmodule:: sklearn
 
+.. _changes_0_22_1:
+
+Version 0.22.1
+==============
+
+**In Development**
+
+This is a bug-fix release to primarily resolve some packaging issues in version
+0.22.0. It also includes minor documentation improvements and some bug fixes.
+
+Changelog
+---------
+
+:mod:`sklearn.utils`
+....................
+
+- |Fix| :func:`utils.check_array` now correctly converts a pandas DataFrame
+  with boolean columns to floats. :pr:`15797` by `Thomas Fan`_.
+
 .. _changes_0_22:
 
 Version 0.22.0

diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py
@@ -826,6 +826,27 @@ def test_check_dataframe_warns_on_dtype():
     assert len(record) == 0
 
 
+def test_check_dataframe_mixed_float_dtypes():
+    # Converting a pandas dataframe with a boolean column to an array coerces
+    # the booleans to object, whereas np.result_type on the column dtypes
+    # returns a float. check_array therefore has to check explicitly for bool
+    # dtypes in a dataframe.
+    # https://github.com/scikit-learn/scikit-learn/issues/15787
+
+    pd = importorskip("pandas")
+    df = pd.DataFrame({
+        'int': [1, 2, 3],
+        'float': [0, 0.1, 2.1],
+        'bool': [True, False, True]}, columns=['int', 'float', 'bool'])
+
+    array = check_array(df, dtype=(np.float64, np.float32, np.float16))
+    expected_array = np.array(
+        [[1.0, 0.0, 1.0],
+         [2.0, 0.1, 0.0],
+         [3.0, 2.1, 1.0]], dtype=np.float64)
+    assert_allclose_dense_sparse(array, expected_array)
+
+
 class DummyMemory:
     def cache(self, func):
         return func

diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
@@ -454,9 +454,14 @@ def check_array(array, accept_sparse=False, accept_large_sparse=True,
     # DataFrame), and store them. If not, store None.
     dtypes_orig = None
     if hasattr(array, "dtypes") and hasattr(array.dtypes, '__array__'):
-        dtypes_orig = np.array(array.dtypes)
+        dtypes_orig = list(array.dtypes)
+        # pandas boolean dtype __array__ interface coerces bools to objects
+        for i, dtype_iter in enumerate(dtypes_orig):
+            if dtype_iter.kind == 'b':
+                dtypes_orig[i] = np.object
+
         if all(isinstance(dtype, np.dtype) for dtype in dtypes_orig):
-            dtype_orig = np.result_type(*array.dtypes)
+            dtype_orig = np.result_type(*dtypes_orig)
 
     if dtype_numeric:
         if dtype_orig is not None and dtype_orig.kind == "O":