Skip to content

Commit

Permalink
remove need for array types
Browse files Browse the repository at this point in the history
  • Loading branch information
jreback committed May 29, 2018
1 parent a661c0b commit 7c79ebc
Show file tree
Hide file tree
Showing 11 changed files with 195 additions and 126 deletions.
4 changes: 1 addition & 3 deletions pandas/core/arrays/__init__.py
@@ -1,6 +1,4 @@
from .base import ExtensionArray # noqa
from .categorical import Categorical # noqa
from .integer import ( # noqa
Int8Array, Int16Array, Int32Array, Int64Array,
UInt8Array, UInt16Array, UInt32Array, UInt64Array,
to_integer_array)
IntegerArray, to_integer_array)
10 changes: 6 additions & 4 deletions pandas/core/arrays/base.py
Expand Up @@ -87,14 +87,16 @@ class ExtensionArray(object):
# Constructors
# ------------------------------------------------------------------------
@classmethod
def _from_sequence(cls, scalars, copy=False):
def _from_sequence(cls, scalars, dtype=None, copy=False):
"""Construct a new ExtensionArray from a sequence of scalars.
Parameters
----------
scalars : Sequence
Each element will be an instance of the scalar type for this
array, ``cls.dtype.type``.
dtype : Dtype, optional
Construct the array for this particular dtype.
copy : boolean, default False
if True, copy the underlying data
Returns
Expand Down Expand Up @@ -377,7 +379,7 @@ def fillna(self, value=None, method=None, limit=None):
func = pad_1d if method == 'pad' else backfill_1d
new_values = func(self.astype(object), limit=limit,
mask=mask)
new_values = self._from_sequence(new_values)
new_values = self._from_sequence(new_values, dtype=self.dtype)
else:
# fill with value
new_values = self.copy()
Expand Down Expand Up @@ -406,7 +408,7 @@ def unique(self):
from pandas import unique

uniques = unique(self.astype(object))
return self._from_sequence(uniques)
return self._from_sequence(uniques, dtype=self.dtype)

def _values_for_factorize(self):
# type: () -> Tuple[ndarray, Any]
Expand Down Expand Up @@ -558,7 +560,7 @@ def take(self, indices, allow_fill=False, fill_value=None):
result = take(data, indices, fill_value=fill_value,
allow_fill=allow_fill)
return self._from_sequence(result)
return self._from_sequence(result, dtype=self.dtype)
"""
# Implementer note: The `fill_value` parameter should be a user-facing
# value, an instance of self.dtype.type. When passed `fill_value=None`,
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/arrays/categorical.py
Expand Up @@ -436,8 +436,8 @@ def _constructor(self):
return Categorical

@classmethod
def _from_sequence(cls, scalars):
return Categorical(scalars)
def _from_sequence(cls, scalars, dtype=None, copy=False):
return Categorical(scalars, dtype=dtype)

def copy(self):
""" Copy constructor. """
Expand Down
199 changes: 124 additions & 75 deletions pandas/core/arrays/integer.py
Expand Up @@ -2,7 +2,9 @@
import warnings
import numpy as np

from pandas.compat import u
from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass
from pandas.util._decorators import cache_readonly
from pandas.compat import set_function_name
from pandas.api.types import (is_integer, is_scalar, is_float,
is_float_dtype, is_integer_dtype,
Expand All @@ -12,21 +14,44 @@
from pandas.core.dtypes.base import ExtensionDtype
from pandas.core.dtypes.dtypes import registry
from pandas.core.dtypes.missing import isna, notna

# available dtypes
_integer_dtypes = ['int8', 'int16', 'int32', 'int64']
_integer_formatter = lambda x: x.capitalize()
_unsigned_dtypes = ['uint8', 'uint16', 'uint32', 'uint64']
_unsigned_formatter = lambda x: "{}{}".format(x[0].upper(), x[1:].capitalize())
from pandas.io.formats.printing import (
format_object_summary, format_object_attrs, default_pprint)


class IntegerDtype(ExtensionDtype):
type = None
na_value = np.nan
kind = 'i'
is_integer = True
is_signed_integer = True
is_unsigned_integer = False

@cache_readonly
def is_signed_integer(self):
return self.kind == 'i'

@cache_readonly
def is_unsigned_integer(self):
return self.kind == 'u'

@cache_readonly
def numpy_dtype(self):
""" Return an instance of our numpy dtype """
return np.dtype(self.type)

@cache_readonly
def kind(self):
return self.numpy_dtype.kind

@classmethod
def construct_array_type(cls, array):
"""Return the array type associated with this dtype
Parameters
----------
array : value array
Returns
-------
type
"""
return IntegerArray

@classmethod
def construct_from_string(cls, string):
Expand All @@ -40,12 +65,6 @@ def construct_from_string(cls, string):
"'{}'".format(cls, string))


class UnsignedIntegerDtype(IntegerDtype):
kind = 'u'
is_signed_integer = False
is_unsigned_integer = True


def to_integer_array(values):
"""
Parameters
Expand All @@ -61,13 +80,14 @@ def to_integer_array(values):
TypeError if incompatible types
"""
values = np.array(values, copy=False)
kind = 'UInt' if values.dtype.kind == 'u' else 'Int'
array_type = "{}{}Array".format(kind, values.dtype.itemsize * 8)
try:
array_type = getattr(module, array_type)
except AttributeError:
dtype = _dtypes[str(values.dtype)]
except KeyError:
if is_float_dtype(values):
return IntegerArray(values)

raise TypeError("Incompatible dtype for {}".format(values.dtype))
return array_type(values, copy=False)
return IntegerArray(values, dtype=dtype, copy=False)


def coerce_to_array(values, dtype, mask=None, copy=False):
Expand All @@ -86,6 +106,14 @@ def coerce_to_array(values, dtype, mask=None, copy=False):
-------
tuple of (values, mask)
"""

if isinstance(values, IntegerArray):
values, mask = values.data, values.mask
if copy:
values = values.copy()
mask = mask.copy()
return values, mask

values = np.array(values, copy=copy)
if is_object_dtype(values):
inferred_type = infer_dtype(values)
Expand All @@ -112,14 +140,23 @@ def coerce_to_array(values, dtype, mask=None, copy=False):
if is_object_dtype(values):
mask |= isna(values)

# infer dtype if needed
if dtype is None:
if is_integer_dtype(values):
dtype = values.dtype
else:
dtype = np.dtype('int64')
else:
dtype = dtype.type

# we copy as we need to coerce here
if mask.any():
values = values.copy()
values[mask] = 1

values = values.astype(dtype.type)
values = values.astype(dtype)
else:
values = values.astype(dtype.type, copy=False)
values = values.astype(dtype, copy=False)

return values, mask

Expand All @@ -131,26 +168,30 @@ class IntegerArray(ExtensionArray):
- mask: a boolean array holding a mask on the data, True is missing
"""

dtype = None
@cache_readonly
def dtype(self):
return _dtypes[str(self.data.dtype)]

def __init__(self, values, mask=None, copy=False):
def __init__(self, values, mask=None, dtype=None, copy=False):
self.data, self.mask = coerce_to_array(
values, dtype=self.dtype, mask=mask, copy=copy)
values, dtype=dtype, mask=mask, copy=copy)

@classmethod
def _from_sequence(cls, scalars, mask=None, copy=False):
return cls(scalars, mask=mask, copy=copy)
def _from_sequence(cls, scalars, mask=None, dtype=None, copy=False):
return cls(scalars, mask=mask, dtype=dtype, copy=copy)

@classmethod
def _from_factorized(cls, values, original):
return cls(values)
return cls(values, dtype=original.dtype)

def __getitem__(self, item):
if is_integer(item):
if self.mask[item]:
return self.dtype.na_value
return self.data[item]
return type(self)(self.data[item], mask=self.mask[item])
return type(self)(self.data[item],
mask=self.mask[item],
dtype=self.dtype)

def _coerce_to_ndarray(self):
""" coerce to an ndarray, preserving my scalar types """
Expand Down Expand Up @@ -205,12 +246,12 @@ def take(self, indexer, allow_fill=False, fill_value=None):
result[fill_mask] = fill_value
mask = mask ^ fill_mask

return self._from_sequence(result, mask=mask)
return type(self)(result, mask=mask, dtype=self.dtype)

def copy(self, deep=False):
if deep:
return type(self)(
self.data.copy(), mask=self.mask.copy())
self.data.copy(), mask=self.mask.copy(), dtype=self.dtype)
return type(self)(self)

def __setitem__(self, key, value):
Expand All @@ -230,11 +271,23 @@ def __len__(self):
return len(self.data)

def __repr__(self):
"""
Return a string representation for this object.
Invoked by unicode(df) in py2 only. Yields a Unicode String in both
py2/py3.
"""
klass = self.__class__.__name__
data = format_object_summary(self, default_pprint, False)
attrs = format_object_attrs(self)
space = " "

formatted = self._formatting_values()
return '{}({})'.format(
self.__class__.__name__,
formatted.tolist())
prepr = (u(",%s") %
space).join(u("%s=%s") % (k, v) for k, v in attrs)

res = u("%s(%s%s)") % (klass, data, prepr)

return res

@property
def nbytes(self):
Expand All @@ -251,7 +304,7 @@ def _na_value(self):
def _concat_same_type(cls, to_concat):
data = np.concatenate([x.data for x in to_concat])
mask = np.concatenate([x.mask for x in to_concat])
return cls(data, mask=mask)
return cls(data, mask=mask, dtype=to_concat[0].dtype)

def astype(self, dtype, copy=True):
"""Cast to a NumPy array or IntegerArray with 'dtype'.
Expand All @@ -269,7 +322,22 @@ def astype(self, dtype, copy=True):
-------
array : ndarray or IntegerArray
NumPy ndarray with 'dtype' for its dtype, or an IntegerArray when
'dtype' is an IntegerDtype.
Raises
------
TypeError
if 'dtype' is an IntegerDtype that the data cannot be cast to under
NumPy's ``same_kind`` casting rule
"""

# if we are astyping to an existing IntegerDtype we can fastpath
if isinstance(dtype, IntegerDtype):
result = self.data.astype(dtype.numpy_dtype,
casting='same_kind', copy=False)
return type(self)(result, mask=self.mask,
dtype=dtype, copy=False)

# coerce
data = self._coerce_to_ndarray()
return data.astype(dtype=dtype, copy=False)

Expand Down Expand Up @@ -412,56 +480,37 @@ def integer_arithmetic_method(self, other):
if is_float_dtype(result):
mask |= (result == np.inf) | (result == -np.inf)

return cls(result, mask=mask)
return cls(result, mask=mask, dtype=self.dtype, copy=False)

name = '__{name}__'.format(name=op.__name__)
return set_function_name(integer_arithmetic_method, name, cls)


class UnsignedIntegerArray(IntegerArray):
pass
IntegerArray._add_numeric_methods_binary()
IntegerArray._add_comparison_methods_binary()


module = sys.modules[__name__]


# create the Dtype
types = [(_integer_dtypes, IntegerDtype, _integer_formatter),
(_unsigned_dtypes, UnsignedIntegerDtype, _unsigned_formatter)]
for dtypes, superclass, formatter in types:
_dtypes = {}
for dtype in ['int8', 'int16', 'int32', 'int64',
'uint8', 'uint16', 'uint32', 'uint64']:

for dtype in dtypes:

name = formatter(dtype)
classname = "{}Dtype".format(name)
attributes_dict = {'type': getattr(np, dtype),
'name': name}
dtype_type = type(classname, (superclass, ), attributes_dict)
setattr(module, classname, dtype_type)

# register
registry.register(dtype_type)


# create the Array
types = [(_integer_dtypes, IntegerArray, _integer_formatter),
(_unsigned_dtypes, UnsignedIntegerArray, _unsigned_formatter)]
for dtypes, superclass, formatter in types:

for dtype in dtypes:

dtype_type = getattr(module, "{}Dtype".format(formatter(dtype)))
classname = "{}Array".format(formatter(dtype))
attributes_dict = {'dtype': dtype_type()}
array_type = type(classname, (superclass, ), attributes_dict)
setattr(module, classname, array_type)

# add ops
array_type._add_numeric_methods_binary()
array_type._add_comparison_methods_binary()

# set the Array type on the Dtype
dtype_type.array_type = array_type
if dtype.startswith('u'):
name = "U{}".format(dtype[1:].capitalize())
else:
name = dtype.capitalize()
classname = "{}Dtype".format(name)
attributes_dict = {'type': getattr(np, dtype),
'name': name}
dtype_type = type(classname, (IntegerDtype, ), attributes_dict)
setattr(module, classname, dtype_type)

# register
registry.register(dtype_type)
_dtypes[dtype] = dtype_type()


def make_data():
Expand Down

0 comments on commit 7c79ebc

Please sign in to comment.