BUG: Fix concat of frames with extension types (no reindexed columns) (
jorisvandenbossche committed Jun 19, 2020
1 parent e6e0889 commit f64df3a
Showing 7 changed files with 50 additions and 12 deletions.
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v1.1.0.rst
@@ -292,6 +292,9 @@ Other enhancements
- :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now accept an ``errors`` argument (:issue:`22610`)
- :meth:`groupby.transform` now allows ``func`` to be ``pad``, ``backfill`` and ``cumcount`` (:issue:`31269`).
- :meth:`~pandas.io.json.read_json` now accepts `nrows` parameter. (:issue:`33916`).
- :func:`concat` and :meth:`~DataFrame.append` now preserve extension dtypes, for example
combining a nullable integer column with a numpy integer column will no longer
result in object dtype but preserve the integer dtype (:issue:`33607`, :issue:`34339`).
- :meth:`~pandas.io.gbq.read_gbq` now allows to disable progress bar (:issue:`33360`).
- :meth:`~pandas.io.gbq.read_gbq` now supports the ``max_results`` kwarg from ``pandas-gbq`` (:issue:`34639`).
- :meth:`DataFrame.to_html` and :meth:`DataFrame.to_string`'s ``col_space`` parameter now accepts a list of dict to change only some specific columns' width (:issue:`28917`).
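The whatsnew entry above describes the user-facing change; a short illustration, mirroring the new test added in pandas/tests/reshape/test_concat.py further down, so the printed dtype is an expectation rather than a transcript:

import numpy as np
import pandas as pd

df1 = pd.DataFrame({"a": pd.array([1, 2, 3], dtype="Int64")})  # nullable integer column
df2 = pd.DataFrame({"a": np.array([4, 5, 6])})                 # plain numpy integer column

result = pd.concat([df1, df2], ignore_index=True)
print(result.dtypes)  # expected: a    Int64   (previously this produced object dtype)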
8 changes: 6 additions & 2 deletions pandas/core/arrays/integer.py
@@ -93,10 +93,14 @@ def construct_array_type(cls) -> Type["IntegerArray"]:

def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]:
# for now only handle other integer types
if not all(isinstance(t, _IntegerDtype) for t in dtypes):
if not all(
isinstance(t, _IntegerDtype)
or (isinstance(t, np.dtype) and np.issubdtype(t, np.integer))
for t in dtypes
):
return None
np_dtype = np.find_common_type(
[t.numpy_dtype for t in dtypes], [] # type: ignore
[t.numpy_dtype if isinstance(t, BaseMaskedDtype) else t for t in dtypes], []
)
if np.issubdtype(np_dtype, np.integer):
return _dtypes[str(np_dtype)]
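A minimal, self-contained sketch of the promotion rule this hunk adds, reduced to plain numpy dtypes: the masked integer dtypes expose their numpy counterparts via numpy_dtype, so the nullable case boils down to ordinary numpy promotion plus a lookup in the _dtypes registry. The helper name below is made up for illustration:

import numpy as np

def common_integer_dtype(dtypes):
    # Mirror the early exit above: only handle inputs that are all integer-like.
    if not all(np.issubdtype(t, np.integer) for t in dtypes):
        return None
    # Resolve width/signedness through numpy's promotion rules; the real code then
    # maps the result back to the matching nullable dtype (e.g. int32 -> Int32).
    return np.find_common_type(dtypes, [])

print(common_integer_dtype([np.dtype("int8"), np.dtype("int32")]))  # int32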
6 changes: 1 addition & 5 deletions pandas/core/dtypes/concat.py
@@ -147,7 +147,7 @@ def is_nonempty(x) -> bool:
single_dtype = len({x.dtype for x in to_concat}) == 1
any_ea = any(is_extension_array_dtype(x.dtype) for x in to_concat)

if any_ea and axis == 0:
if any_ea:
if not single_dtype:
target_dtype = find_common_type([x.dtype for x in to_concat])
to_concat = [_cast_to_common_type(arr, target_dtype) for arr in to_concat]
@@ -161,10 +161,6 @@ def is_nonempty(x) -> bool:
elif _contains_datetime or "timedelta" in typs:
return concat_datetime(to_concat, axis=axis, typs=typs)

elif any_ea and axis == 1:
to_concat = [np.atleast_2d(x.astype("object")) for x in to_concat]
return np.concatenate(to_concat, axis=axis)

elif all_empty:
# we have all empties, but may need to coerce the result dtype to
# object if we have non-numeric type operands (numpy would otherwise
11 changes: 10 additions & 1 deletion pandas/core/internals/concat.py
@@ -319,6 +319,15 @@ def _concatenate_join_units(join_units, concat_axis, copy):
concat_values = concat_values.copy()
else:
concat_values = concat_values.copy()
elif any(isinstance(t, ExtensionArray) for t in to_concat):
# concatting with at least one EA means we are concatting a single column
# the non-EA values are 2D arrays with shape (1, n)
to_concat = [t if isinstance(t, ExtensionArray) else t[0, :] for t in to_concat]
concat_values = concat_compat(to_concat, axis=concat_axis)
if not isinstance(concat_values, ExtensionArray):
# if the result of concat is not an EA but an ndarray, reshape to
# 2D to put it a non-EA Block
concat_values = np.atleast_2d(concat_values)
else:
concat_values = concat_compat(to_concat, axis=concat_axis)

@@ -443,7 +452,7 @@ def _is_uniform_join_units(join_units: List[JoinUnit]) -> bool:
# cannot necessarily join
return (
# all blocks need to have the same type
all(isinstance(ju.block, type(join_units[0].block)) for ju in join_units)
all(type(ju.block) is type(join_units[0].block) for ju in join_units)
and # noqa
# no blocks that would get missing values (can lead to type upcasts)
# unless we're an extension dtype.
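The comment in the first hunk explains the shape convention: when at least one piece of a column is an ExtensionArray (always 1-D), the numpy block values for that column arrive as 2-D arrays of shape (1, n). A rough standalone sketch of that reshaping, with a plain numpy concatenation standing in for the internal concat_compat call:

import numpy as np
import pandas as pd

ea = pd.array([1, 2, 3], dtype="Int64")   # 1-D ExtensionArray piece
np_block = np.array([[4, 5, 6]])          # numpy block values, shape (1, 3)

# Drop the leading axis of the ndarray pieces so everything is 1-D before concatenating.
pieces = [
    p if isinstance(p, pd.api.extensions.ExtensionArray) else p[0, :]
    for p in [ea, np_block]
]
values = np.concatenate([np.asarray(p, dtype="int64") for p in pieces])
values_2d = np.atleast_2d(values)         # reshape back to (1, 6) for a non-EA Block
print(values_2d.shape)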
4 changes: 2 additions & 2 deletions pandas/tests/extension/base/reshaping.py
@@ -62,11 +62,11 @@ def test_concat_mixed_dtypes(self, data):
self.assert_series_equal(result, expected)

# simple test for just EA and one other
result = pd.concat([df1, df2])
result = pd.concat([df1, df2.astype(object)])
expected = pd.concat([df1.astype("object"), df2.astype("object")])
self.assert_frame_equal(result, expected)

result = pd.concat([df1["A"], df2["A"]])
result = pd.concat([df1["A"], df2["A"].astype(object)])
expected = pd.concat([df1["A"].astype("object"), df2["A"].astype("object")])
self.assert_series_equal(result, expected)

16 changes: 14 additions & 2 deletions pandas/tests/indexing/test_indexing.py
@@ -1006,12 +1006,24 @@ def test_extension_array_cross_section():


def test_extension_array_cross_section_converts():
# all numeric columns -> numeric series
df = pd.DataFrame(
{"A": pd.core.arrays.integer_array([1, 2]), "B": np.array([1, 2])},
{"A": pd.array([1, 2], dtype="Int64"), "B": np.array([1, 2])}, index=["a", "b"],
)
result = df.loc["a"]
expected = pd.Series([1, 1], dtype="Int64", index=["A", "B"], name="a")
tm.assert_series_equal(result, expected)

result = df.iloc[0]
tm.assert_series_equal(result, expected)

# mixed columns -> object series
df = pd.DataFrame(
{"A": pd.array([1, 2], dtype="Int64"), "B": np.array(["a", "b"])},
index=["a", "b"],
)
result = df.loc["a"]
expected = pd.Series([1, 1], dtype=object, index=["A", "B"], name="a")
expected = pd.Series([1, "a"], dtype=object, index=["A", "B"], name="a")
tm.assert_series_equal(result, expected)

result = df.iloc[0]
14 changes: 14 additions & 0 deletions pandas/tests/reshape/test_concat.py
@@ -2843,3 +2843,17 @@ def test_concat_preserves_subclass(obj):

result = concat([obj, obj])
assert isinstance(result, type(obj))


def test_concat_frame_axis0_extension_dtypes():
# preserve extension dtype (through common_dtype mechanism)
df1 = pd.DataFrame({"a": pd.array([1, 2, 3], dtype="Int64")})
df2 = pd.DataFrame({"a": np.array([4, 5, 6])})

result = pd.concat([df1, df2], ignore_index=True)
expected = pd.DataFrame({"a": [1, 2, 3, 4, 5, 6]}, dtype="Int64")
tm.assert_frame_equal(result, expected)

result = pd.concat([df2, df1], ignore_index=True)
expected = pd.DataFrame({"a": [4, 5, 6, 1, 2, 3]}, dtype="Int64")
tm.assert_frame_equal(result, expected)
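
Per the whatsnew entry, DataFrame.append follows the same path as concat; a companion check in the style of the test above (not part of this commit, expected result assumed from the whatsnew note):

import numpy as np
import pandas as pd
import pandas._testing as tm

df1 = pd.DataFrame({"a": pd.array([1, 2, 3], dtype="Int64")})
df2 = pd.DataFrame({"a": np.array([4, 5, 6])})

result = df1.append(df2, ignore_index=True)   # DataFrame.append is still available in pandas 1.1
expected = pd.DataFrame({"a": [1, 2, 3, 4, 5, 6]}, dtype="Int64")
tm.assert_frame_equal(result, expected)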
