ENH: add in extension dtype registry (#21185)

pandas-dev · Jul 3, 2018 · 2f14faf · 2f14faf
1 parent a70e356
commit 2f14faf
Show file tree

Hide file tree

Showing 24 changed files with 302 additions and 82 deletions.
diff --git a/doc/source/extending.rst b/doc/source/extending.rst
@@ -91,8 +91,16 @@ extension array for IP Address data, this might be ``ipaddress.IPv4Address``.
 
 See the `extension dtype source`_ for interface definition.
 
+.. versionadded:: 0.24.0
+
+:class:`pandas.api.extensions.ExtensionDtype` can be registered with pandas to allow creation via a string dtype name.
+This allows one to instantiate a ``Series`` or call ``.astype()`` with a registered string name; for
+example, ``'category'`` is the registered string name for the ``CategoricalDtype``.
+
+See the `extension dtype dtypes`_ for more on how to register dtypes.
+
 :class:`~pandas.api.extensions.ExtensionArray`
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 This class provides all the array-like functionality. ExtensionArrays are
 limited to 1 dimension. An ExtensionArray is linked to an ExtensionDtype via the
@@ -179,6 +187,7 @@ To use a test, subclass it:
 See https://github.com/pandas-dev/pandas/blob/master/pandas/tests/extension/base/__init__.py
 for a list of all the tests available.
 
+.. _extension dtype dtypes: https://github.com/pandas-dev/pandas/blob/master/pandas/core/dtypes/dtypes.py
 .. _extension dtype source: https://github.com/pandas-dev/pandas/blob/master/pandas/core/dtypes/base.py
 .. _extension array source: https://github.com/pandas-dev/pandas/blob/master/pandas/core/arrays/base.py
 

diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -128,6 +128,23 @@ Previous Behavior:
     In [3]: pi - pi[0]
     Out[3]: Int64Index([0, 1, 2], dtype='int64')
 
+.. _whatsnew_0240.api.extension:
+
+ExtensionType Changes
+^^^^^^^^^^^^^^^^^^^^^
+
+- ``ExtensionDtype`` has gained the ability to instantiate from string dtypes, e.g. ``decimal`` would instantiate a registered ``DecimalDtype``; furthermore
+  the ``ExtensionDtype`` has gained the method ``construct_array_type`` (:issue:`21185`)
+- The ``ExtensionArray`` constructor, ``_from_sequence``, now takes the keyword arg ``copy=False`` (:issue:`21185`)
+- Bug in :meth:`Series.get` for ``Series`` using ``ExtensionArray`` and integer index (:issue:`21257`)
+- :meth:`Series.combine()` works correctly with :class:`~pandas.api.extensions.ExtensionArray` inside of :class:`Series` (:issue:`20825`)
+- :meth:`Series.combine()` with scalar argument now works for any function type (:issue:`21248`)
+-
+
+.. _whatsnew_0240.api.other:
+
+Other API Changes
+^^^^^^^^^^^^^^^^^
 
 .. _whatsnew_0240.api.incompatibilities:
 
@@ -168,6 +185,7 @@ Other API Changes
 ^^^^^^^^^^^^^^^^^
 
 - :class:`DatetimeIndex` now accepts :class:`Int64Index` arguments as epoch timestamps (:issue:`20997`)
+- Invalid construction of ``IntervalDtype`` will now always raise a ``TypeError`` rather than a ``ValueError`` if the subdtype is invalid (:issue:`21185`)
 -
 -
 
@@ -344,13 +362,6 @@ Reshaping
 -
 -
 
-ExtensionArray
-^^^^^^^^^^^^^^
-
-- Bug in :meth:`Series.get` for ``Series`` using ``ExtensionArray`` and integer index (:issue:`21257`)
-- :meth:`Series.combine()` works correctly with :class:`~pandas.api.extensions.ExtensionArray` inside of :class:`Series` (:issue:`20825`)
-- :meth:`Series.combine()` with scalar argument now works for any function type (:issue:`21248`)
--
 -
 
 Other

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -154,7 +154,7 @@ def _reconstruct_data(values, dtype, original):
     """
     from pandas import Index
     if is_extension_array_dtype(dtype):
-        pass
+        values = dtype.construct_array_type()._from_sequence(values)
     elif is_datetime64tz_dtype(dtype) or is_period_dtype(dtype):
         values = Index(original)._shallow_copy(values, name=None)
     elif is_bool_dtype(dtype):
@@ -705,7 +705,7 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
 
     else:
 
-        if is_categorical_dtype(values) or is_sparse(values):
+        if is_extension_array_dtype(values) or is_sparse(values):
 
             # handle Categorical and sparse,
             result = Series(values)._values.value_counts(dropna=dropna)

diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -54,6 +54,7 @@ class ExtensionArray(object):
     methods:
 
     * fillna
+    * dropna
     * unique
     * factorize / _values_for_factorize
     * argsort / _values_for_argsort
@@ -87,14 +88,16 @@ class ExtensionArray(object):
     # Constructors
     # ------------------------------------------------------------------------
     @classmethod
-    def _from_sequence(cls, scalars):
+    def _from_sequence(cls, scalars, copy=False):
         """Construct a new ExtensionArray from a sequence of scalars.
 
         Parameters
         ----------
         scalars : Sequence
             Each element will be an instance of the scalar type for this
             array, ``cls.dtype.type``.
+        copy : boolean, default False
+            if True, copy the underlying data
         Returns
         -------
         ExtensionArray
@@ -384,6 +387,16 @@ def fillna(self, value=None, method=None, limit=None):
             new_values = self.copy()
         return new_values
 
+    def dropna(self):
+        """ Return ExtensionArray without NA values
+
+        Returns
+        -------
+        valid : ExtensionArray
+        """
+
+        return self[~self.isna()]
+
     def unique(self):
         """Compute the ExtensionArray of unique values.
 

diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py
@@ -109,6 +109,11 @@ class ExtensionDtype(_DtypeOpsMixin):
     * name
     * construct_from_string
 
+    Optionally one can override construct_array_type for construction
+    with the name of this dtype via the Registry
+
+    * construct_array_type
+
     The `na_value` class attribute can be used to set the default NA value
     for this type. :attr:`numpy.nan` is used by default.
 
@@ -156,6 +161,16 @@ def name(self):
         """
         raise AbstractMethodError(self)
 
+    @classmethod
+    def construct_array_type(cls):
+        """Return the array type associated with this dtype
+
+        Returns
+        -------
+        type
+        """
+        raise NotImplementedError
+
     @classmethod
     def construct_from_string(cls, string):
         """Attempt to construct this type from a string.

diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
@@ -648,6 +648,11 @@ def conv(r, dtype):
 def astype_nansafe(arr, dtype, copy=True):
     """ return a view if copy is False, but
         need to be very careful as the result shape could change! """
+
+    # dispatch on extension dtype if needed
+    if is_extension_array_dtype(dtype):
+        return dtype.array_type._from_sequence(arr, copy=copy)
+
     if not isinstance(dtype, np.dtype):
         dtype = pandas_dtype(dtype)
 

diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
@@ -5,10 +5,11 @@
                            PY3, PY36)
 from pandas._libs import algos, lib
 from pandas._libs.tslibs import conversion
+
 from pandas.core.dtypes.dtypes import (
-    CategoricalDtype, CategoricalDtypeType, DatetimeTZDtype,
+    registry, CategoricalDtype, CategoricalDtypeType, DatetimeTZDtype,
     DatetimeTZDtypeType, PeriodDtype, PeriodDtypeType, IntervalDtype,
-    IntervalDtypeType, ExtensionDtype, PandasExtensionDtype)
+    IntervalDtypeType, ExtensionDtype)
 from pandas.core.dtypes.generic import (
     ABCCategorical, ABCPeriodIndex, ABCDatetimeIndex, ABCSeries,
     ABCSparseArray, ABCSparseSeries, ABCCategoricalIndex, ABCIndexClass,
@@ -1977,38 +1978,13 @@ def pandas_dtype(dtype):
     np.dtype or a pandas dtype
     """
 
-    if isinstance(dtype, DatetimeTZDtype):
-        return dtype
-    elif isinstance(dtype, PeriodDtype):
-        return dtype
-    elif isinstance(dtype, CategoricalDtype):
-        return dtype
-    elif isinstance(dtype, IntervalDtype):
-        return dtype
-    elif isinstance(dtype, string_types):
-        try:
-            return DatetimeTZDtype.construct_from_string(dtype)
-        except TypeError:
-            pass
-
-        if dtype.startswith('period[') or dtype.startswith('Period['):
-            # do not parse string like U as period[U]
-            try:
-                return PeriodDtype.construct_from_string(dtype)
-            except TypeError:
-                pass
-
-        elif dtype.startswith('interval') or dtype.startswith('Interval'):
-            try:
-                return IntervalDtype.construct_from_string(dtype)
-            except TypeError:
-                pass
+    # registered extension types
+    result = registry.find(dtype)
+    if result is not None:
+        return result
 
-        try:
-            return CategoricalDtype.construct_from_string(dtype)
-        except TypeError:
-            pass
-    elif isinstance(dtype, (PandasExtensionDtype, ExtensionDtype)):
+    # un-registered extension types
+    if isinstance(dtype, ExtensionDtype):
         return dtype
 
     try:

diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
@@ -8,6 +8,65 @@
 from .base import ExtensionDtype, _DtypeOpsMixin
 
 
+class Registry(object):
+    """
+    Registry for dtype inference
+
+    The registry allows one to map a string repr of an extension
+    dtype to an extension dtype.
+
+    Multiple extension types can be registered.
+    These are tried in order.
+
+    Examples
+    --------
+    registry.register(MyExtensionDtype)
+    """
+    dtypes = []
+
+    @classmethod
+    def register(self, dtype):
+        """
+        Parameters
+        ----------
+        dtype : ExtensionDtype
+        """
+        if not issubclass(dtype, (PandasExtensionDtype, ExtensionDtype)):
+            raise ValueError("can only register pandas extension dtypes")
+
+        self.dtypes.append(dtype)
+
+    def find(self, dtype):
+        """
+        Parameters
+        ----------
+        dtype : PandasExtensionDtype or string
+
+        Returns
+        -------
+        return the first matching dtype, otherwise return None
+        """
+        if not isinstance(dtype, compat.string_types):
+            dtype_type = dtype
+            if not isinstance(dtype, type):
+                dtype_type = type(dtype)
+            if issubclass(dtype_type, (PandasExtensionDtype, ExtensionDtype)):
+                return dtype
+
+            return None
+
+        for dtype_type in self.dtypes:
+            try:
+                return dtype_type.construct_from_string(dtype)
+            except TypeError:
+                pass
+
+        return None
+
+
+registry = Registry()
+
+
 class PandasExtensionDtype(_DtypeOpsMixin):
     """
     A np.dtype duck-typed class, suitable for holding a custom dtype.
@@ -265,6 +324,17 @@ def _hash_categories(categories, ordered=True):
         else:
             return np.bitwise_xor.reduce(hashed)
 
+    @classmethod
+    def construct_array_type(cls):
+        """Return the array type associated with this dtype
+
+        Returns
+        -------
+        type
+        """
+        from pandas import Categorical
+        return Categorical
+
     @classmethod
     def construct_from_string(cls, string):
         """ attempt to construct this type from a string, raise a TypeError if
@@ -556,11 +626,16 @@ def _parse_dtype_strict(cls, freq):
     @classmethod
     def construct_from_string(cls, string):
         """
-        attempt to construct this type from a string, raise a TypeError
-        if its not possible
+        Strict construction from a string, raise a TypeError if not
+        possible
         """
         from pandas.tseries.offsets import DateOffset
-        if isinstance(string, (compat.string_types, DateOffset)):
+
+        if (isinstance(string, compat.string_types) and
+            (string.startswith('period[') or
+             string.startswith('Period[')) or
+                isinstance(string, DateOffset)):
+            # do not parse string like U as period[U]
             # avoid tuple to be regarded as freq
             try:
                 return cls(freq=string)
@@ -660,7 +735,7 @@ def __new__(cls, subtype=None):
             try:
                 subtype = pandas_dtype(subtype)
             except TypeError:
-                raise ValueError("could not construct IntervalDtype")
+                raise TypeError("could not construct IntervalDtype")
 
         if is_categorical_dtype(subtype) or is_string_dtype(subtype):
             # GH 19016
@@ -682,8 +757,11 @@ def construct_from_string(cls, string):
         attempt to construct this type from a string, raise a TypeError
         if its not possible
         """
-        if isinstance(string, compat.string_types):
+        if (isinstance(string, compat.string_types) and
+            (string.startswith('interval') or
+             string.startswith('Interval'))):
             return cls(string)
+
         msg = "a string needs to be passed, got type {typ}"
         raise TypeError(msg.format(typ=type(string)))
 
@@ -727,3 +805,10 @@ def is_dtype(cls, dtype):
             else:
                 return False
         return super(IntervalDtype, cls).is_dtype(dtype)
+
+
+# register the dtypes in search order
+registry.register(DatetimeTZDtype)
+registry.register(PeriodDtype)
+registry.register(IntervalDtype)
+registry.register(CategoricalDtype)
diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py
@@ -796,7 +796,7 @@ def astype(self, dtype, copy=True):
     @cache_readonly
     def dtype(self):
         """Return the dtype object of the underlying data"""
-        return IntervalDtype.construct_from_string(str(self.left.dtype))
+        return IntervalDtype(self.left.dtype.name)
 
     @property
     def inferred_type(self):