
Commit

Backport PR #51976 on branch 2.0.x (BUG: read_csv for arrow with mismatching dtypes does not work) (#51995)

BUG: read_csv for arrow with mismatching dtypes does not work (#51976)

* BUG: read_csv for arrow with mismatching dtypes does not work

* Rename var
phofl committed Mar 15, 2023
1 parent 4eb55ed commit b9dd4fa
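For context, a hedged sketch of the user-facing path that reaches the patched helper: the C parser splits a large file into chunks under low_memory, parses each chunk separately, and then has to concatenate per-column arrays whose inferred dtypes may differ. The snippet below is not part of the commit; the column values, the row count, and the assumption that the input is big enough to actually trigger chunking are illustrative only.

# Hedged illustration (not from the commit) of how read_csv reaches
# _concatenate_chunks with pyarrow-backed arrays of mismatching dtypes.
# Assumptions: the row count is large enough that the C parser's low_memory
# mode actually splits the input into several chunks (the chunk-size
# heuristic is internal), and dtype_backend="pyarrow" makes each chunk an
# ArrowExtensionArray.
import io

import pandas as pd

n_rows = 2_000_000  # assumed big enough to force multiple low_memory chunks
data = "a\n" + "1\n" * n_rows + "2.5\n" * n_rows

df = pd.read_csv(
    io.StringIO(data),
    engine="c",
    low_memory=True,
    dtype_backend="pyarrow",
)
print(df["a"].dtype)  # with this fix, int/float chunks concatenate cleanly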
Showing 2 changed files with 44 additions and 34 deletions.
42 changes: 8 additions & 34 deletions pandas/io/parsers/c_parser_wrapper.py
@@ -29,8 +29,10 @@
     is_categorical_dtype,
     pandas_dtype,
 )
-from pandas.core.dtypes.concat import union_categoricals
-from pandas.core.dtypes.dtypes import ExtensionDtype
+from pandas.core.dtypes.concat import (
+    concat_compat,
+    union_categoricals,
+)
 
 from pandas.core.indexes.api import ensure_index_from_sequences
 
@@ -378,43 +380,15 @@ def _concatenate_chunks(chunks: list[dict[int, ArrayLike]]) -> dict:
         arrs = [chunk.pop(name) for chunk in chunks]
         # Check each arr for consistent types.
         dtypes = {a.dtype for a in arrs}
-        # TODO: shouldn't we exclude all EA dtypes here?
-        numpy_dtypes = {x for x in dtypes if not is_categorical_dtype(x)}
-        if len(numpy_dtypes) > 1:
-            # error: Argument 1 to "find_common_type" has incompatible type
-            # "Set[Any]"; expected "Sequence[Union[dtype[Any], None, type,
-            # _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any,
-            # Union[int, Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]]"
-            common_type = np.find_common_type(
-                numpy_dtypes,  # type: ignore[arg-type]
-                [],
-            )
-            if common_type == np.dtype(object):
-                warning_columns.append(str(name))
+        non_cat_dtypes = {x for x in dtypes if not is_categorical_dtype(x)}
 
         dtype = dtypes.pop()
         if is_categorical_dtype(dtype):
             result[name] = union_categoricals(arrs, sort_categories=False)
         else:
-            if isinstance(dtype, ExtensionDtype):
-                # TODO: concat_compat?
-                array_type = dtype.construct_array_type()
-                # error: Argument 1 to "_concat_same_type" of "ExtensionArray"
-                # has incompatible type "List[Union[ExtensionArray, ndarray]]";
-                # expected "Sequence[ExtensionArray]"
-                result[name] = array_type._concat_same_type(
-                    arrs  # type: ignore[arg-type]
-                )
-            else:
-                # error: Argument 1 to "concatenate" has incompatible
-                # type "List[Union[ExtensionArray, ndarray[Any, Any]]]"
-                # ; expected "Union[_SupportsArray[dtype[Any]],
-                # Sequence[_SupportsArray[dtype[Any]]],
-                # Sequence[Sequence[_SupportsArray[dtype[Any]]]],
-                # Sequence[Sequence[Sequence[_SupportsArray[dtype[Any]]]]]
-                # , Sequence[Sequence[Sequence[Sequence[
-                # _SupportsArray[dtype[Any]]]]]]]"
-                result[name] = np.concatenate(arrs)  # type: ignore[arg-type]
+            result[name] = concat_compat(arrs)
+            if len(non_cat_dtypes) > 1 and result[name].dtype == np.dtype(object):
+                warning_columns.append(str(name))
 
     if warning_columns:
         warning_names = ",".join(warning_columns)
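The heart of the change: the removed branch dispatched on ExtensionDtype and called ExtensionArray._concat_same_type, which assumes every chunk already shares one dtype, while the new code routes everything non-categorical through concat_compat, which finds a compatible common dtype and only warns when the result degrades to object. A minimal sketch of that behaviour (mirroring the new test below; assumes pyarrow is installed):

# Minimal sketch (not part of the commit) of what concat_compat does for the
# mismatching-dtype case the old code path did not handle.
import pyarrow as pa

from pandas.core.arrays import ArrowExtensionArray
from pandas.core.dtypes.concat import concat_compat

float_chunk = ArrowExtensionArray(pa.array([1.5, 2.5]))  # double[pyarrow]
int_chunk = ArrowExtensionArray(pa.array([1, 2]))        # int64[pyarrow]

# concat_compat upcasts to a common dtype rather than requiring identical
# dtypes the way the removed _concat_same_type branch did.
combined = concat_compat([float_chunk, int_chunk])
print(combined.dtype)  # a floating pyarrow dtype, per the new test below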
36 changes: 36 additions & 0 deletions pandas/tests/io/parser/test_concatenate_chunks.py
@@ -0,0 +1,36 @@
+import numpy as np
+import pytest
+
+from pandas.errors import DtypeWarning
+
+import pandas._testing as tm
+from pandas.core.arrays import ArrowExtensionArray
+
+from pandas.io.parsers.c_parser_wrapper import _concatenate_chunks
+
+
+def test_concatenate_chunks_pyarrow():
+    # GH#51876
+    pa = pytest.importorskip("pyarrow")
+    chunks = [
+        {0: ArrowExtensionArray(pa.array([1.5, 2.5]))},
+        {0: ArrowExtensionArray(pa.array([1, 2]))},
+    ]
+    result = _concatenate_chunks(chunks)
+    expected = ArrowExtensionArray(pa.array([1.5, 2.5, 1.0, 2.0]))
+    tm.assert_extension_array_equal(result[0], expected)
+
+
+def test_concatenate_chunks_pyarrow_strings():
+    # GH#51876
+    pa = pytest.importorskip("pyarrow")
+    chunks = [
+        {0: ArrowExtensionArray(pa.array([1.5, 2.5]))},
+        {0: ArrowExtensionArray(pa.array(["a", "b"]))},
+    ]
+    with tm.assert_produces_warning(DtypeWarning, match="have mixed types"):
+        result = _concatenate_chunks(chunks)
+    expected = np.concatenate(
+        [np.array([1.5, 2.5], dtype=object), np.array(["a", "b"])]
+    )
+    tm.assert_numpy_array_equal(result[0], expected)
