remove need for array types

pandas-dev · May 29, 2018 · 7c79ebc · 7c79ebc
1 parent a661c0b
commit 7c79ebc
Show file tree

Hide file tree

Showing 11 changed files with 195 additions and 126 deletions.
diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py
@@ -1,6 +1,4 @@
 from .base import ExtensionArray  # noqa
 from .categorical import Categorical  # noqa
 from .integer import (  # noqa
-    Int8Array, Int16Array, Int32Array, Int64Array,
-    UInt8Array, UInt16Array, UInt32Array, UInt64Array,
-    to_integer_array)
+    IntegerArray, to_integer_array)
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -87,14 +87,16 @@ class ExtensionArray(object):
     # Constructors
     # ------------------------------------------------------------------------
     @classmethod
-    def _from_sequence(cls, scalars, copy=False):
+    def _from_sequence(cls, scalars, dtype=None, copy=False):
         """Construct a new ExtensionArray from a sequence of scalars.
 
         Parameters
         ----------
         scalars : Sequence
             Each element will be an instance of the scalar type for this
             array, ``cls.dtype.type``.
+        dtype : Dtype, optional
+            construct for this particular dtype
         copy : boolean, default True
             if True, copy the underlying data
         Returns
@@ -377,7 +379,7 @@ def fillna(self, value=None, method=None, limit=None):
                 func = pad_1d if method == 'pad' else backfill_1d
                 new_values = func(self.astype(object), limit=limit,
                                   mask=mask)
-                new_values = self._from_sequence(new_values)
+                new_values = self._from_sequence(new_values, dtype=self.dtype)
             else:
                 # fill with value
                 new_values = self.copy()
@@ -406,7 +408,7 @@ def unique(self):
         from pandas import unique
 
         uniques = unique(self.astype(object))
-        return self._from_sequence(uniques)
+        return self._from_sequence(uniques, dtype=self.dtype)
 
     def _values_for_factorize(self):
         # type: () -> Tuple[ndarray, Any]
@@ -558,7 +560,7 @@ def take(self, indices, allow_fill=False, fill_value=None):
 
                result = take(data, indices, fill_value=fill_value,
                              allow_fill=allow_fill)
-               return self._from_sequence(result)
+               return self._from_sequence(result, dtype=self.dtype)
         """
         # Implementer note: The `fill_value` parameter should be a user-facing
         # value, an instance of self.dtype.type. When passed `fill_value=None`,

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -436,8 +436,8 @@ def _constructor(self):
         return Categorical
 
     @classmethod
-    def _from_sequence(cls, scalars):
-        return Categorical(scalars)
+    def _from_sequence(cls, scalars, dtype=None, copy=False):
+        return Categorical(scalars, dtype=dtype)
 
     def copy(self):
         """ Copy constructor. """

diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
@@ -2,7 +2,9 @@
 import warnings
 import numpy as np
 
+from pandas.compat import u
 from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass
+from pandas.util._decorators import cache_readonly
 from pandas.compat import set_function_name
 from pandas.api.types import (is_integer, is_scalar, is_float,
                               is_float_dtype, is_integer_dtype,
@@ -12,21 +14,44 @@
 from pandas.core.dtypes.base import ExtensionDtype
 from pandas.core.dtypes.dtypes import registry
 from pandas.core.dtypes.missing import isna, notna
-
-# available dtypes
-_integer_dtypes = ['int8', 'int16', 'int32', 'int64']
-_integer_formatter = lambda x: x.capitalize()
-_unsigned_dtypes = ['uint8', 'uint16', 'uint32', 'uint64']
-_unsigned_formatter = lambda x: "{}{}".format(x[0].upper(), x[1:].capitalize())
+from pandas.io.formats.printing import (
+    format_object_summary, format_object_attrs, default_pprint)
 
 
 class IntegerDtype(ExtensionDtype):
     type = None
     na_value = np.nan
-    kind = 'i'
-    is_integer = True
-    is_signed_integer = True
-    is_unsigned_integer = False
+
+    @cache_readonly
+    def is_signed_integer(self):
+        return self.kind == 'i'
+
+    @cache_readonly
+    def is_unsigned_integer(self):
+        return self.kind == 'u'
+
+    @cache_readonly
+    def numpy_dtype(self):
+        """ Return an instance of our numpy dtype """
+        return np.dtype(self.type)
+
+    @cache_readonly
+    def kind(self):
+        return self.numpy_dtype.kind
+
+    @classmethod
+    def construct_array_type(cls, array):
+        """Return the array type associated with this dtype
+
+        Parameters
+        ----------
+        array : value array
+
+        Returns
+        -------
+        type
+        """
+        return IntegerArray
 
     @classmethod
     def construct_from_string(cls, string):
@@ -40,12 +65,6 @@ def construct_from_string(cls, string):
                         "'{}'".format(cls, string))
 
 
-class UnsignedIntegerDtype(IntegerDtype):
-    kind = 'u'
-    is_signed_integer = False
-    is_unsigned_integer = True
-
-
 def to_integer_array(values):
     """
     Parameters
@@ -61,13 +80,14 @@ def to_integer_array(values):
     TypeError if incompatible types
     """
     values = np.array(values, copy=False)
-    kind = 'UInt' if values.dtype.kind == 'u' else 'Int'
-    array_type = "{}{}Array".format(kind, values.dtype.itemsize * 8)
     try:
-        array_type = getattr(module, array_type)
-    except AttributeError:
+        dtype = _dtypes[str(values.dtype)]
+    except KeyError:
+        if is_float_dtype(values):
+            return IntegerArray(values)
+
         raise TypeError("Incompatible dtype for {}".format(values.dtype))
-    return array_type(values, copy=False)
+    return IntegerArray(values, dtype=dtype, copy=False)
 
 
 def coerce_to_array(values, dtype, mask=None, copy=False):
@@ -86,6 +106,14 @@ def coerce_to_array(values, dtype, mask=None, copy=False):
     -------
     tuple of (values, mask)
     """
+
+    if isinstance(values, IntegerArray):
+        values, mask = values.data, values.mask
+        if copy:
+            values = values.copy()
+            mask = mask.copy()
+        return values, mask
+
     values = np.array(values, copy=copy)
     if is_object_dtype(values):
         inferred_type = infer_dtype(values)
@@ -112,14 +140,23 @@ def coerce_to_array(values, dtype, mask=None, copy=False):
     if is_object_dtype(values):
         mask |= isna(values)
 
+    # infer dtype if needed
+    if dtype is None:
+        if is_integer_dtype(values):
+            dtype = values.dtype
+        else:
+            dtype = np.dtype('int64')
+    else:
+        dtype = dtype.type
+
     # we copy as need to coerce here
     if mask.any():
         values = values.copy()
         values[mask] = 1
 
-        values = values.astype(dtype.type)
+        values = values.astype(dtype)
     else:
-        values = values.astype(dtype.type, copy=False)
+        values = values.astype(dtype, copy=False)
 
     return values, mask
 
@@ -131,26 +168,30 @@ class IntegerArray(ExtensionArray):
     - mask: a boolean array holding a mask on the data, False is missing
     """
 
-    dtype = None
+    @cache_readonly
+    def dtype(self):
+        return _dtypes[str(self.data.dtype)]
 
-    def __init__(self, values, mask=None, copy=False):
+    def __init__(self, values, mask=None, dtype=None, copy=False):
         self.data, self.mask = coerce_to_array(
-            values, dtype=self.dtype, mask=mask, copy=copy)
+            values, dtype=dtype, mask=mask, copy=copy)
 
     @classmethod
-    def _from_sequence(cls, scalars, mask=None, copy=False):
-        return cls(scalars, mask=mask, copy=copy)
+    def _from_sequence(cls, scalars, mask=None, dtype=None, copy=False):
+        return cls(scalars, mask=mask, dtype=dtype, copy=copy)
 
     @classmethod
     def _from_factorized(cls, values, original):
-        return cls(values)
+        return cls(values, dtype=original.dtype)
 
     def __getitem__(self, item):
         if is_integer(item):
             if self.mask[item]:
                 return self.dtype.na_value
             return self.data[item]
-        return type(self)(self.data[item], mask=self.mask[item])
+        return type(self)(self.data[item],
+                          mask=self.mask[item],
+                          dtype=self.dtype)
 
     def _coerce_to_ndarray(self):
         """ coerce to an ndarary, preserving my scalar types """
@@ -205,12 +246,12 @@ def take(self, indexer, allow_fill=False, fill_value=None):
             result[fill_mask] = fill_value
             mask = mask ^ fill_mask
 
-        return self._from_sequence(result, mask=mask)
+        return type(self)(result, mask=mask, dtype=self.dtype)
 
     def copy(self, deep=False):
         if deep:
             return type(self)(
-                self.data.copy(), mask=self.mask.copy())
+                self.data.copy(), mask=self.mask.copy(), dtype=self.dtype)
         return type(self)(self)
 
     def __setitem__(self, key, value):
@@ -230,11 +271,23 @@ def __len__(self):
         return len(self.data)
 
     def __repr__(self):
+        """
+        Return a string representation for this object.
+
+        Invoked by repr(obj). Yields a Unicode String in both
+        py2/py3.
+        """
+        klass = self.__class__.__name__
+        data = format_object_summary(self, default_pprint, False)
+        attrs = format_object_attrs(self)
+        space = " "
 
-        formatted = self._formatting_values()
-        return '{}({})'.format(
-            self.__class__.__name__,
-            formatted.tolist())
+        prepr = (u(",%s") %
+                 space).join(u("%s=%s") % (k, v) for k, v in attrs)
+
+        res = u("%s(%s%s)") % (klass, data, prepr)
+
+        return res
 
     @property
     def nbytes(self):
@@ -251,7 +304,7 @@ def _na_value(self):
     def _concat_same_type(cls, to_concat):
         data = np.concatenate([x.data for x in to_concat])
         mask = np.concatenate([x.mask for x in to_concat])
-        return cls(data, mask=mask)
+        return cls(data, mask=mask, dtype=to_concat[0].dtype)
 
     def astype(self, dtype, copy=True):
         """Cast to a NumPy array with 'dtype'.
@@ -269,7 +322,22 @@ def astype(self, dtype, copy=True):
         -------
         array : ndarray
             NumPy ndarray with 'dtype' for its dtype.
+
+        Raises
+        ------
+        TypeError
+            if incompatible type with an IntegerDtype, equivalent of same_kind
+            casting
         """
+
+        # if we are astyping to an existing IntegerDtype we can fastpath
+        if isinstance(dtype, IntegerDtype):
+            result = self.data.astype(dtype.numpy_dtype,
+                                      casting='same_kind', copy=False)
+            return type(self)(result, mask=self.mask,
+                              dtype=dtype, copy=False)
+
+        # coerce
         data = self._coerce_to_ndarray()
         return data.astype(dtype=dtype, copy=False)
 
@@ -412,56 +480,37 @@ def integer_arithmetic_method(self, other):
             if is_float_dtype(result):
                 mask |= (result == np.inf) | (result == -np.inf)
 
-            return cls(result, mask=mask)
+            return cls(result, mask=mask, dtype=self.dtype, copy=False)
 
         name = '__{name}__'.format(name=op.__name__)
         return set_function_name(integer_arithmetic_method, name, cls)
 
 
-class UnsignedIntegerArray(IntegerArray):
-    pass
+IntegerArray._add_numeric_methods_binary()
+IntegerArray._add_comparison_methods_binary()
 
 
 module = sys.modules[__name__]
 
 
 # create the Dtype
-types = [(_integer_dtypes, IntegerDtype, _integer_formatter),
-         (_unsigned_dtypes, UnsignedIntegerDtype, _unsigned_formatter)]
-for dtypes, superclass, formatter in types:
+_dtypes = {}
+for dtype in ['int8', 'int16', 'int32', 'int64',
+              'uint8', 'uint16', 'uint32', 'uint64']:
 
-    for dtype in dtypes:
-
-        name = formatter(dtype)
-        classname = "{}Dtype".format(name)
-        attributes_dict = {'type': getattr(np, dtype),
-                           'name': name}
-        dtype_type = type(classname, (superclass, ), attributes_dict)
-        setattr(module, classname, dtype_type)
-
-        # register
-        registry.register(dtype_type)
-
-
-# create the Array
-types = [(_integer_dtypes, IntegerArray, _integer_formatter),
-         (_unsigned_dtypes, UnsignedIntegerArray, _unsigned_formatter)]
-for dtypes, superclass, formatter in types:
-
-    for dtype in dtypes:
-
-        dtype_type = getattr(module, "{}Dtype".format(formatter(dtype)))
-        classname = "{}Array".format(formatter(dtype))
-        attributes_dict = {'dtype': dtype_type()}
-        array_type = type(classname, (superclass, ), attributes_dict)
-        setattr(module, classname, array_type)
-
-        # add ops
-        array_type._add_numeric_methods_binary()
-        array_type._add_comparison_methods_binary()
-
-        # set the Array type on the Dtype
-        dtype_type.array_type = array_type
+    if dtype.startswith('u'):
+        name = "U{}".format(dtype[1:].capitalize())
+    else:
+        name = dtype.capitalize()
+    classname = "{}Dtype".format(name)
+    attributes_dict = {'type': getattr(np, dtype),
+                       'name': name}
+    dtype_type = type(classname, (IntegerDtype, ), attributes_dict)
+    setattr(module, classname, dtype_type)
+
+    # register
+    registry.register(dtype_type)
+    _dtypes[dtype] = dtype_type()
 
 
 def make_data():