From ed56aa39b05434f0650f7cc9631cb84604e044f3 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 4 Oct 2018 15:07:32 -0500 Subject: [PATCH 1/8] API: ExtensionDtype Equality and Hashability Closes https://github.com/pandas-dev/pandas/issues/22476 --- doc/source/whatsnew/v0.24.0.txt | 9 +++++ pandas/core/dtypes/base.py | 45 +++++++++++++++++++++---- pandas/core/dtypes/dtypes.py | 9 +++-- pandas/tests/extension/base/dtype.py | 7 ++++ pandas/tests/extension/decimal/array.py | 6 +--- pandas/tests/extension/json/array.py | 1 + 6 files changed, 60 insertions(+), 17 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index c9874b4dd03d6..437fd985820e1 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -492,6 +492,15 @@ Previous Behavior: ExtensionType Changes ^^^^^^^^^^^^^^^^^^^^^ +**:class:`pandas.api.extensions.ExtensionDtype` Equality and Hashability** + +Pandas now requires that extension dtypes be hashable. The base class implements +a default ``__eq__`` and ``__hash__``. If you have a parametrized dtype, you should +update the ``ExtensionDtype._metadata`` tuple to match the signature of your +``__init__`` method. See :class:`pandas.api.extensions.ExtensionDtype` for more. + +**Other changes** + - ``ExtensionArray`` has gained the abstract methods ``.dropna()`` (:issue:`21185`) - ``ExtensionDtype`` has gained the ability to instantiate from string dtypes, e.g. ``decimal`` would instantiate a registered ``DecimalDtype``; furthermore the ``ExtensionDtype`` has gained the method ``construct_array_type`` (:issue:`21185`) diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index b0fa55e346613..ac4d6d1590f38 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -22,14 +22,17 @@ class _DtypeOpsMixin(object): # of the NA value, not the physical NA vaalue for storage. # e.g. for JSONArray, this is an empty dictionary. 
na_value = np.nan + _metadata = () def __eq__(self, other): """Check whether 'other' is equal to self. - By default, 'other' is considered equal if + By default, 'other' is considered equal if either * it's a string matching 'self.name'. - * it's an instance of this type. + * it's an instance of this type and all of + the attributes in ``self._metadata`` are equal between + `self` and `other`. Parameters ---------- @@ -40,11 +43,19 @@ def __eq__(self, other): bool """ if isinstance(other, compat.string_types): - return other == self.name - elif isinstance(other, type(self)): - return True - else: - return False + try: + other = self.construct_from_string(other) + except TypeError: + return False + if isinstance(other, type(self)): + return all( + getattr(self, attr) == getattr(other, attr) + for attr in self._metadata + ) + return False + + def __hash__(self): + return hash(tuple(getattr(self, attr) for attr in self._metadata)) def __ne__(self, other): return not self.__eq__(other) @@ -161,6 +172,26 @@ class ExtensionDtype(_DtypeOpsMixin): The `na_value` class attribute can be used to set the default NA value for this type. :attr:`numpy.nan` is used by default. + ExtensionDtypes are required to be hashable. The base class provides + a default implementation, which relies on the ``_metadata`` class + attribute. ``_metadata`` should be a tuple containing the strings + that define your data type. For example, with ``PeriodDtype`` that's + the ``freq`` attribute. + + **If you have a parametrized dtype you should set the ``_metadata`` + class property**. + + Ideally, the attributes in ``_metadata`` will match the + parameters to your ``ExtensionDtype.__init__`` (if any). If any of + the attributes in ``_metadata`` don't implement the standard + ``__eq__`` or ``__hash__``, the default implementations here will not + work. + + .. versionchanged:: 0.24.0 + + Added ``_metadata``, ``__hash__``, and changed the default definition + of ``__eq__``. 
+ This class does not inherit from 'abc.ABCMeta' for performance reasons. Methods and properties required by the interface raise ``pandas.errors.AbstractMethodError`` and no ``register`` method is diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index beda9bc02f4d5..611cae28877c3 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -101,7 +101,6 @@ class PandasExtensionDtype(_DtypeOpsMixin): base = None isbuiltin = 0 isnative = 0 - _metadata = [] _cache = {} def __unicode__(self): @@ -209,7 +208,7 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): kind = 'O' str = '|O08' base = np.dtype('O') - _metadata = ['categories', 'ordered'] + _metadata = ('categories', 'ordered') _cache = {} def __init__(self, categories=None, ordered=None): @@ -485,7 +484,7 @@ class DatetimeTZDtype(PandasExtensionDtype): str = '|M8[ns]' num = 101 base = np.dtype('M8[ns]') - _metadata = ['unit', 'tz'] + _metadata = ('unit', 'tz') _match = re.compile(r"(datetime64|M8)\[(?P.+), (?P.+)\]") _cache = {} @@ -589,7 +588,7 @@ class PeriodDtype(PandasExtensionDtype): str = '|O08' base = np.dtype('O') num = 102 - _metadata = ['freq'] + _metadata = ('freq',) _match = re.compile(r"(P|p)eriod\[(?P.+)\]") _cache = {} @@ -709,7 +708,7 @@ class IntervalDtype(PandasExtensionDtype, ExtensionDtype): str = '|O08' base = np.dtype('O') num = 103 - _metadata = ['subtype'] + _metadata = ('subtype',) _match = re.compile(r"(I|i)nterval\[(?P.+)\]") _cache = {} diff --git a/pandas/tests/extension/base/dtype.py b/pandas/tests/extension/base/dtype.py index 8d1f1cadcc23f..d5cf9571e3622 100644 --- a/pandas/tests/extension/base/dtype.py +++ b/pandas/tests/extension/base/dtype.py @@ -49,6 +49,10 @@ def test_eq_with_str(self, dtype): def test_eq_with_numpy_object(self, dtype): assert dtype != np.dtype('object') + def test_eq_with_self(self, dtype): + assert dtype == dtype + assert dtype != object() + def test_array_type(self, data, dtype): assert 
dtype.construct_array_type() is type(data) @@ -81,3 +85,6 @@ def test_check_dtype(self, data): index=list('ABCD')) result = df.dtypes.apply(str) == str(dtype) self.assert_series_equal(result, expected) + + def test_hashable(self, dtype): + hash(dtype) # no error diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 79e1a692f744a..a1ee3a4fefef2 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -15,15 +15,11 @@ class DecimalDtype(ExtensionDtype): type = decimal.Decimal name = 'decimal' na_value = decimal.Decimal('NaN') + _metadata = ('context',) def __init__(self, context=None): self.context = context or decimal.getcontext() - def __eq__(self, other): - if isinstance(other, type(self)): - return self.context == other.context - return super(DecimalDtype, self).__eq__(other) - def __repr__(self): return 'DecimalDtype(context={})'.format(self.context) diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 87876d84bef99..976511941042d 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -27,6 +27,7 @@ class JSONDtype(ExtensionDtype): type = compat.Mapping name = 'json' + try: na_value = collections.UserDict() except AttributeError: From 463c0bd6f4a4011205f599a8f2323d559a8f7b2e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 4 Oct 2018 15:46:01 -0500 Subject: [PATCH 2/8] issue note --- doc/source/whatsnew/v0.24.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 437fd985820e1..a1467cbca963a 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -497,7 +497,7 @@ ExtensionType Changes Pandas now requires that extension dtypes be hashable. The base class implements a default ``__eq__`` and ``__hash__``. 
If you have a parametrized dtype, you should update the ``ExtensionDtype._metadata`` tuple to match the signature of your -``__init__`` method. See :class:`pandas.api.extensions.ExtensionDtype` for more. +``__init__`` method. See :class:`pandas.api.extensions.ExtensionDtype` for more (:issue:`22476`). **Other changes** From 2a1660c77e8ad0c28485c6d81227ac9d6b6de4f2 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 4 Oct 2018 14:13:51 -0500 Subject: [PATCH 3/8] BUG: concat different EAs --- doc/source/whatsnew/v0.24.0.txt | 1 + pandas/core/internals/managers.py | 3 +-- pandas/tests/reshape/test_concat.py | 13 +++++++++++++ 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index a1467cbca963a..1f42cc3621c0d 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -514,6 +514,7 @@ update the ``ExtensionDtype._metadata`` tuple to match the signature of your - :meth:`Series.astype` and :meth:`DataFrame.astype` now dispatch to :meth:`ExtensionArray.astype` (:issue:`21185:`). - Slicing a single row of a ``DataFrame`` with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`) - Added :meth:`pandas.api.types.register_extension_dtype` to register an extension type with pandas (:issue:`22664`) +- Bug in concatenation an Series with two different extension dtypes not casting to object dtype (:issue:`22994`) - Updated the ``.type`` attribute for ``PeriodDtype``, ``DatetimeTZDtype``, and ``IntervalDtype`` to be instances of the dtype (``Period``, ``Timestamp``, and ``Interval`` respectively) (:issue:`22938`) .. 
_whatsnew_0240.api.incompatibilities: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 2f29f1ae2509f..428b12def7962 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1634,8 +1634,7 @@ def concat(self, to_concat, new_axis): # check if all series are of the same block type: if len(non_empties) > 0: blocks = [obj.blocks[0] for obj in non_empties] - - if all(type(b) is type(blocks[0]) for b in blocks[1:]): # noqa + if len({b.dtype for b in blocks}) == 1: new_block = blocks[0].concat_same_type(blocks) else: values = [x.values for x in blocks] diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 2aaa04d571e69..841e33fbe9524 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -1,6 +1,7 @@ from warnings import catch_warnings, simplefilter from itertools import combinations from collections import deque +from decimal import Decimal import datetime as dt import dateutil @@ -19,6 +20,7 @@ from pandas.util import testing as tm from pandas.util.testing import (assert_frame_equal, makeCustomDataframe as mkdf) +from pandas.tests.extension.decimal import to_decimal import pytest @@ -2361,6 +2363,17 @@ def test_concat_datetime_timezone(self): index=idx1.append(idx1)) tm.assert_frame_equal(result, expected) + def test_concat_different_extension_dtypes_upcasts(self): + a = pd.Series(pd.core.arrays.integer_array([1, 2])) + b = pd.Series(to_decimal([1, 2])) + + result = pd.concat([a, b], ignore_index=True) + expected = pd.Series([ + 1, 2, + Decimal(1), Decimal(2) + ], dtype=object) + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize('pdt', [pd.Series, pd.DataFrame, pd.Panel]) @pytest.mark.parametrize('dt', np.sctypes['float']) From 869e2bfdf3a7ce80de7093387542f0d0e8557b08 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 14 Oct 2018 16:52:14 -0500 Subject: [PATCH 4/8] Allow multiple fill values --- 
pandas/core/dtypes/concat.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index ac824708245d2..2b1778e5bcb2e 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -560,11 +560,6 @@ def _concat_sparse(to_concat, axis=0, typs=None): fill_values = [x.fill_value for x in to_concat if isinstance(x, SparseArray)] - - if len(set(fill_values)) > 1: - raise ValueError("Cannot concatenate SparseArrays with different " - "fill values") - fill_value = fill_values[0] # TODO: Fix join unit generation so we aren't passed this. From b6ccab469c3e82db20c839a556d67cfbe1b27f43 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 15 Oct 2018 06:37:47 -0500 Subject: [PATCH 5/8] skips --- pandas/tests/extension/decimal/test_decimal.py | 10 +++++++--- pandas/tests/reshape/test_concat.py | 3 ++- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index f84d24295b049..be1c61166e4b1 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -100,7 +100,9 @@ def test_hashable(self, dtype): class TestInterface(BaseDecimal, base.BaseInterfaceTests): - pass + + pytestmark = pytest.mark.skipif(compat.PY2, + reason="Unhashble dtype in Py2.") class TestConstructors(BaseDecimal, base.BaseConstructorsTests): @@ -112,7 +114,8 @@ def test_from_dtype(self, data): class TestReshaping(BaseDecimal, base.BaseReshapingTests): - pass + pytestmark = pytest.mark.skipif(compat.PY2, + reason="Unhashble dtype in Py2.") class TestGetitem(BaseDecimal, base.BaseGetitemTests): @@ -174,7 +177,8 @@ class TestCasting(BaseDecimal, base.BaseCastingTests): class TestGroupby(BaseDecimal, base.BaseGroupbyTests): - pass + pytestmark = pytest.mark.skipif(compat.PY2, + reason="Unhashble dtype in Py2.") class TestSetitem(BaseDecimal, base.BaseSetitemTests): diff --git 
a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 841e33fbe9524..e2b44852d218f 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -15,7 +15,7 @@ read_csv, isna, Series, date_range, Index, Panel, MultiIndex, Timestamp, DatetimeIndex, Categorical) -from pandas.compat import Iterable +from pandas.compat import Iterable, PY2 from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.util import testing as tm from pandas.util.testing import (assert_frame_equal, @@ -2363,6 +2363,7 @@ def test_concat_datetime_timezone(self): index=idx1.append(idx1)) tm.assert_frame_equal(result, expected) + @pytest.mark.skipif(PY2, reason="Unhashable Decimal dtype") def test_concat_different_extension_dtypes_upcasts(self): a = pd.Series(pd.core.arrays.integer_array([1, 2])) b = pd.Series(to_decimal([1, 2])) From 9507a0ce9ea813d4d14d8859285f9f8f368abed4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 15 Oct 2018 07:00:15 -0500 Subject: [PATCH 6/8] fixup [ci skip] --- doc/source/whatsnew/v0.24.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 0c3137b2400bd..3d7e7686b2db6 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -602,7 +602,7 @@ update the ``ExtensionDtype._metadata`` tuple to match the signature of your - :meth:`Series.astype` and :meth:`DataFrame.astype` now dispatch to :meth:`ExtensionArray.astype` (:issue:`21185:`). 
- Slicing a single row of a ``DataFrame`` with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`) - Added :meth:`pandas.api.types.register_extension_dtype` to register an extension type with pandas (:issue:`22664`) -- Bug in concatenation an Series with two different extension dtypes not casting to object dtype (:issue:`22994`) +- Bug when concatenating multiple ``Series`` with different extension dtypes not casting to object dtype (:issue:`22994`) - Series backed by an ``ExtensionArray`` now work with :func:`util.hash_pandas_object` (:issue:`23066`) - Updated the ``.type`` attribute for ``PeriodDtype``, ``DatetimeTZDtype``, and ``IntervalDtype`` to be instances of the dtype (``Period``, ``Timestamp``, and ``Interval`` respectively) (:issue:`22938`) - :func:`ExtensionArray.isna` is allowed to return an ``ExtensionArray`` (:issue:`22325`). From 5502bb8233c71f56e4e92554596172bc2942912e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 15 Oct 2018 07:00:48 -0500 Subject: [PATCH 7/8] trigger CI From e17d3976fc9d6f37822c4f78b0933de4a4964b9a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 15 Oct 2018 10:42:18 -0500 Subject: [PATCH 8/8] lint --- pandas/tests/reshape/test_concat.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index e2b44852d218f..d39c9fafe5749 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -9,13 +9,12 @@ from numpy.random import randn from datetime import datetime -from pandas.compat import StringIO, iteritems, PY2 +from pandas.compat import Iterable, StringIO, iteritems, PY2 import pandas as pd from pandas import (DataFrame, concat, read_csv, isna, Series, date_range, Index, Panel, MultiIndex, Timestamp, DatetimeIndex, Categorical) -from pandas.compat import Iterable, PY2 from pandas.core.dtypes.dtypes import CategoricalDtype from 
pandas.util import testing as tm from pandas.util.testing import (assert_frame_equal,