diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 14d1e1b49a726..a27e6e8433779 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -292,6 +292,9 @@ Other enhancements - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now accept an ``errors`` argument (:issue:`22610`) - :meth:`groupby.transform` now allows ``func`` to be ``pad``, ``backfill`` and ``cumcount`` (:issue:`31269`). - :meth:`~pandas.io.json.read_json` now accepts `nrows` parameter. (:issue:`33916`). +- :func:`concat` and :meth:`~DataFrame.append` now preserve extension dtypes, for example + combining a nullable integer column with a numpy integer column will no longer + result in object dtype but preserve the integer dtype (:issue:`33607`, :issue:`34339`). - :meth:`~pandas.io.gbq.read_gbq` now allows to disable progress bar (:issue:`33360`). - :meth:`~pandas.io.gbq.read_gbq` now supports the ``max_results`` kwarg from ``pandas-gbq`` (:issue:`34639`). - :meth:`DataFrame.to_html` and :meth:`DataFrame.to_string`'s ``col_space`` parameter now accepts a list of dict to change only some specific columns' width (:issue:`28917`). 
diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index ac06f7cce88d5..df43b5d6115ba 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -93,10 +93,14 @@ def construct_array_type(cls) -> Type["IntegerArray"]: def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: # for now only handle other integer types - if not all(isinstance(t, _IntegerDtype) for t in dtypes): + if not all( + isinstance(t, _IntegerDtype) + or (isinstance(t, np.dtype) and np.issubdtype(t, np.integer)) + for t in dtypes + ): return None np_dtype = np.find_common_type( - [t.numpy_dtype for t in dtypes], [] # type: ignore + [t.numpy_dtype if isinstance(t, BaseMaskedDtype) else t for t in dtypes], [] ) if np.issubdtype(np_dtype, np.integer): return _dtypes[str(np_dtype)] diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index fb47b33ce9890..71686bfc313fb 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -147,7 +147,7 @@ def is_nonempty(x) -> bool: single_dtype = len({x.dtype for x in to_concat}) == 1 any_ea = any(is_extension_array_dtype(x.dtype) for x in to_concat) - if any_ea and axis == 0: + if any_ea: if not single_dtype: target_dtype = find_common_type([x.dtype for x in to_concat]) to_concat = [_cast_to_common_type(arr, target_dtype) for arr in to_concat] @@ -161,10 +161,6 @@ def is_nonempty(x) -> bool: elif _contains_datetime or "timedelta" in typs: return concat_datetime(to_concat, axis=axis, typs=typs) - elif any_ea and axis == 1: - to_concat = [np.atleast_2d(x.astype("object")) for x in to_concat] - return np.concatenate(to_concat, axis=axis) - elif all_empty: # we have all empties, but may need to coerce the result dtype to # object if we have non-numeric type operands (numpy would otherwise diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index e25c4c2341217..fd8c5f5e27c02 100644 --- a/pandas/core/internals/concat.py +++ 
b/pandas/core/internals/concat.py @@ -319,6 +319,15 @@ def _concatenate_join_units(join_units, concat_axis, copy): concat_values = concat_values.copy() else: concat_values = concat_values.copy() + elif any(isinstance(t, ExtensionArray) for t in to_concat): + # concatting with at least one EA means we are concatting a single column + # the non-EA values are 2D arrays with shape (1, n) + to_concat = [t if isinstance(t, ExtensionArray) else t[0, :] for t in to_concat] + concat_values = concat_compat(to_concat, axis=concat_axis) + if not isinstance(concat_values, ExtensionArray): + # if the result of concat is not an EA but an ndarray, reshape to + # 2D to put it in a non-EA Block + concat_values = np.atleast_2d(concat_values) else: concat_values = concat_compat(to_concat, axis=concat_axis) @@ -443,7 +452,7 @@ def _is_uniform_join_units(join_units: List[JoinUnit]) -> bool: # cannot necessarily join return ( # all blocks need to have the same type - all(isinstance(ju.block, type(join_units[0].block)) for ju in join_units) + all(type(ju.block) is type(join_units[0].block) for ju in join_units) and # noqa # no blocks that would get missing values (can lead to type upcasts) # unless we're an extension dtype. 
diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index c9445ceec2c77..cd932e842e00c 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -62,11 +62,11 @@ def test_concat_mixed_dtypes(self, data): self.assert_series_equal(result, expected) # simple test for just EA and one other - result = pd.concat([df1, df2]) + result = pd.concat([df1, df2.astype(object)]) expected = pd.concat([df1.astype("object"), df2.astype("object")]) self.assert_frame_equal(result, expected) - result = pd.concat([df1["A"], df2["A"]]) + result = pd.concat([df1["A"], df2["A"].astype(object)]) expected = pd.concat([df1["A"].astype("object"), df2["A"].astype("object")]) self.assert_series_equal(result, expected) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index dd63a26f139e9..5c0230e75021c 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -1006,12 +1006,24 @@ def test_extension_array_cross_section(): def test_extension_array_cross_section_converts(): + # all numeric columns -> numeric series df = pd.DataFrame( - {"A": pd.core.arrays.integer_array([1, 2]), "B": np.array([1, 2])}, + {"A": pd.array([1, 2], dtype="Int64"), "B": np.array([1, 2])}, index=["a", "b"], + ) + result = df.loc["a"] + expected = pd.Series([1, 1], dtype="Int64", index=["A", "B"], name="a") + tm.assert_series_equal(result, expected) + + result = df.iloc[0] + tm.assert_series_equal(result, expected) + + # mixed columns -> object series + df = pd.DataFrame( + {"A": pd.array([1, 2], dtype="Int64"), "B": np.array(["a", "b"])}, index=["a", "b"], ) result = df.loc["a"] - expected = pd.Series([1, 1], dtype=object, index=["A", "B"], name="a") + expected = pd.Series([1, "a"], dtype=object, index=["A", "B"], name="a") tm.assert_series_equal(result, expected) result = df.iloc[0] diff --git a/pandas/tests/reshape/test_concat.py 
b/pandas/tests/reshape/test_concat.py index 1c9d00a4b4c90..ffeb5ff0f8aaa 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -2843,3 +2843,17 @@ def test_concat_preserves_subclass(obj): result = concat([obj, obj]) assert isinstance(result, type(obj)) + + +def test_concat_frame_axis0_extension_dtypes(): + # preserve extension dtype (through common_dtype mechanism) + df1 = pd.DataFrame({"a": pd.array([1, 2, 3], dtype="Int64")}) + df2 = pd.DataFrame({"a": np.array([4, 5, 6])}) + + result = pd.concat([df1, df2], ignore_index=True) + expected = pd.DataFrame({"a": [1, 2, 3, 4, 5, 6]}, dtype="Int64") + tm.assert_frame_equal(result, expected) + + result = pd.concat([df2, df1], ignore_index=True) + expected = pd.DataFrame({"a": [4, 5, 6, 1, 2, 3]}, dtype="Int64") + tm.assert_frame_equal(result, expected)