diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
index e838afdbbd083..4add3b9e80280 100644
--- a/doc/source/whatsnew/v0.23.0.txt
+++ b/doc/source/whatsnew/v0.23.0.txt
@@ -843,6 +843,7 @@ Categorical
 - Bug in :meth:`Series.astype` and ``Categorical.astype()`` where an existing categorical data does not get updated (:issue:`10696`, :issue:`18593`)
 - Bug in :class:`Index` constructor with ``dtype=CategoricalDtype(...)`` where ``categories`` and ``ordered`` are not maintained (issue:`19032`)
 - Bug in :class:`Series` constructor with scalar and ``dtype=CategoricalDtype(...)`` where ``categories`` and ``ordered`` are not maintained (issue:`19565`)
+- Bug in ``Categorical.__iter__`` not converting to Python types (:issue:`19909`)
 - Bug in :func:`pandas.factorize` returning the unique codes for the ``uniques``. This now returns a ``Categorical`` with the same dtype as the input (:issue:`19721`)
 - Bug in :func:`pandas.factorize` including an item for missing values in the ``uniques`` return value (:issue:`19721`)
 
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
index 1f33081a5f610..fa565aa802faf 100644
--- a/pandas/core/arrays/base.py
+++ b/pandas/core/arrays/base.py
@@ -236,6 +236,59 @@ def isna(self):
         """
         raise AbstractMethodError(self)
 
+    def fillna(self, value=None, method=None, limit=None):
+        """ Fill NA/NaN values using the specified method.
+
+        Parameters
+        ----------
+        value : scalar, array-like
+            If a scalar value is passed it is used to fill all missing values.
+            Alternatively, an array-like 'value' can be given. It's expected
+            that the array-like have the same length as 'self'.
+        method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
+            Method to use for filling holes in reindexed Series
+            pad / ffill: propagate last valid observation forward to next valid
+            backfill / bfill: use NEXT valid observation to fill gap
+        limit : int, default None
+            If method is specified, this is the maximum number of consecutive
+            NaN values to forward/backward fill. In other words, if there is
+            a gap with more than this number of consecutive NaNs, it will only
+            be partially filled. If method is not specified, this is the
+            maximum number of entries along the entire axis where NaNs will be
+            filled.
+
+        Returns
+        -------
+        filled : ExtensionArray with NA/NaN filled
+        """
+        from pandas.api.types import is_scalar
+        from pandas.util._validators import validate_fillna_kwargs
+        from pandas.core.missing import pad_1d, backfill_1d
+
+        value, method = validate_fillna_kwargs(value, method)
+
+        mask = self.isna()
+
+        if not is_scalar(value):
+            if len(value) != len(self):
+                raise ValueError("Length of 'value' does not match. Got ({}) "
+                                 " expected {}".format(len(value), len(self)))
+            value = value[mask]
+
+        if mask.any():
+            if method is not None:
+                func = pad_1d if method == 'pad' else backfill_1d
+                new_values = func(self.astype(object), limit=limit,
+                                  mask=mask)
+                new_values = self._constructor_from_sequence(new_values)
+            else:
+                # fill with value
+                new_values = self.copy()
+                new_values[mask] = value
+        else:
+            new_values = self.copy()
+        return new_values
+
     def unique(self):
         """Compute the ExtensionArray of unique values.
 
@@ -285,6 +338,7 @@ def take(self, indexer, allow_fill=True, fill_value=None):
         .. code-block:: python
 
            def take(self, indexer, allow_fill=True, fill_value=None):
+               indexer = np.asarray(indexer)
                mask = indexer == -1
                result = self.data.take(indexer)
                result[mask] = np.nan  # NA for this type
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index b37f88d8bfdce..43b8bc6244637 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -480,9 +480,7 @@ def tolist(self):
         (for str, int, float) or a pandas scalar
         (for Timestamp/Timedelta/Interval/Period)
         """
-        if is_datetimelike(self.categories):
-            return [com._maybe_box_datetimelike(x) for x in self]
-        return np.array(self).tolist()
+        return list(self)
 
     @property
     def base(self):
@@ -1592,16 +1590,16 @@ def fillna(self, value=None, method=None, limit=None):
 
         Parameters
         ----------
-        method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
-            Method to use for filling holes in reindexed Series
-            pad / ffill: propagate last valid observation forward to next valid
-            backfill / bfill: use NEXT valid observation to fill gap
         value : scalar, dict, Series
             If a scalar value is passed it is used to fill all missing values.
             Alternatively, a Series or dict can be used to fill in different
             values for each index. The value should not be a list. The
             value(s) passed should either be in the categories or should be
             NaN.
+        method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
+            Method to use for filling holes in reindexed Series
+            pad / ffill: propagate last valid observation forward to next valid
+            backfill / bfill: use NEXT valid observation to fill gap
         limit : int, default None
             (Not implemented yet for Categorical!)
             If method is specified, this is the maximum number of consecutive
@@ -1717,7 +1715,7 @@ def __len__(self):
 
     def __iter__(self):
         """Returns an Iterator over the values of this Categorical."""
-        return iter(self.get_values())
+        return iter(self.get_values().tolist())
 
     def _tidy_repr(self, max_vals=10, footer=True):
         """ a short repr displaying only max_vals and an optional (but default
diff --git a/pandas/core/base.py b/pandas/core/base.py
index 257b26b64e642..3ccfe88a1d54d 100644
--- a/pandas/core/base.py
+++ b/pandas/core/base.py
@@ -9,10 +9,10 @@
 from pandas.core.dtypes.missing import isna
 from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries, ABCIndexClass
 from pandas.core.dtypes.common import (
+    is_datetimelike,
     is_object_dtype,
     is_list_like,
     is_scalar,
-    is_datetimelike,
     is_extension_type,
     is_extension_array_dtype)
 
@@ -826,9 +826,10 @@ def tolist(self):
         --------
         numpy.ndarray.tolist
         """
-
-        if is_datetimelike(self):
+        if is_datetimelike(self._values):
             return [com._maybe_box_datetimelike(x) for x in self._values]
+        elif is_extension_array_dtype(self._values):
+            return list(self._values)
         else:
             return self._values.tolist()
 
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index b1d0dc2a2442e..747dfe62e1fb7 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -11,7 +11,8 @@
 from .common import (_ensure_object, is_bool, is_integer, is_float,
                      is_complex, is_datetimetz, is_categorical_dtype,
                      is_datetimelike,
-                     is_extension_type, is_object_dtype,
+                     is_extension_type,
+                     is_object_dtype,
                      is_datetime64tz_dtype, is_datetime64_dtype,
                      is_datetime64_ns_dtype,
                      is_timedelta64_dtype, is_timedelta64_ns_dtype,
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index 240c9b1f3377c..81baa3734f12f 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -1963,6 +1963,24 @@ def concat_same_type(self, to_concat, placement=None):
         return self.make_block_same_class(values, ndim=self.ndim,
                                           placement=placement)
 
+    def fillna(self, value, limit=None, inplace=False, downcast=None,
+               mgr=None):
+        """Fill missing values by delegating to ``self.values.fillna``."""
+        values = self.values if inplace else self.values.copy()
+        values = values.fillna(value=value, limit=limit)
+        return [self.make_block_same_class(values=values,
+                                           placement=self.mgr_locs,
+                                           ndim=self.ndim)]
+
+    def interpolate(self, method='pad', axis=0, inplace=False, limit=None,
+                    fill_value=None, **kwargs):
+        """Fill missing values with ``method`` via ``self.values.fillna``."""
+        values = self.values if inplace else self.values.copy()
+        return self.make_block_same_class(
+            values=values.fillna(value=fill_value, method=method,
+                                 limit=limit),
+            placement=self.mgr_locs)
+
 
 class NumericBlock(Block):
     __slots__ = ()
@@ -2522,27 +2540,6 @@ def _try_coerce_result(self, result):
 
         return result
 
-    def fillna(self, value, limit=None, inplace=False, downcast=None,
-               mgr=None):
-        # we may need to upcast our fill to match our dtype
-        if limit is not None:
-            raise NotImplementedError("specifying a limit for 'fillna' has "
-                                      "not been implemented yet")
-
-        values = self.values if inplace else self.values.copy()
-        values = self._try_coerce_result(values.fillna(value=value,
-                                                       limit=limit))
-        return [self.make_block(values=values)]
-
-    def interpolate(self, method='pad', axis=0, inplace=False, limit=None,
-                    fill_value=None, **kwargs):
-
-        values = self.values if inplace else self.values.copy()
-        return self.make_block_same_class(
-            values=values.fillna(fill_value=fill_value, method=method,
-                                 limit=limit),
-            placement=self.mgr_locs)
-
     def shift(self, periods, axis=0, mgr=None):
         return self.make_block_same_class(values=self.values.shift(periods),
                                           placement=self.mgr_locs)
diff --git a/pandas/tests/categorical/test_dtypes.py b/pandas/tests/categorical/test_dtypes.py
index 8973d1196f6a9..00e99db628c2a 100644
--- a/pandas/tests/categorical/test_dtypes.py
+++ b/pandas/tests/categorical/test_dtypes.py
@@ -1,12 +1,12 @@
 # -*- coding: utf-8 -*-
-
 import pytest
 
 import numpy as np
 
 import pandas.util.testing as tm
 from pandas.core.dtypes.dtypes import CategoricalDtype
-from pandas import Categorical, Index, CategoricalIndex, Series
+from pandas.compat import long
+from pandas import Categorical, Index, CategoricalIndex, Series, Timestamp
 
 
 class TestCategoricalDtypes(object):
@@ -161,3 +161,16 @@ def test_astype_category(self, dtype_ordered, cat_ordered):
         result = cat.astype('category')
         expected = cat
         tm.assert_categorical_equal(result, expected)
+
+    def test_iter_python_types(self):
+        # GH-19909
+        # TODO(Py2): Remove long
+        cat = Categorical([1, 2])
+        assert isinstance(list(cat)[0], (int, long))
+        assert isinstance(cat.tolist()[0], (int, long))
+
+    def test_iter_python_types_datetime(self):
+        cat = Categorical([Timestamp('2017-01-01'),
+                           Timestamp('2017-01-02')])
+        assert isinstance(list(cat)[0], Timestamp)
+        assert isinstance(cat.tolist()[0], Timestamp)
diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py
index adc690939b36c..74fe8f196a089 100644
--- a/pandas/tests/extension/base/casting.py
+++ b/pandas/tests/extension/base/casting.py
@@ -11,3 +11,8 @@ def test_astype_object_series(self, all_data):
         ser = pd.Series({"A": all_data})
         result = ser.astype(object)
         assert isinstance(result._data.blocks[0], ObjectBlock)
+
+    def test_tolist(self, data):
+        result = pd.Series(data).tolist()
+        expected = list(data)
+        assert result == expected
diff --git a/pandas/tests/extension/base/missing.py b/pandas/tests/extension/base/missing.py
index 3ae82fa1ca432..bf404ac01bf2b 100644
--- a/pandas/tests/extension/base/missing.py
+++ b/pandas/tests/extension/base/missing.py
@@ -1,4 +1,5 @@
 import numpy as np
+import pytest
 
 import pandas as pd
 import pandas.util.testing as tm
@@ -45,3 +46,71 @@ def test_dropna_frame(self, data_missing):
         result = df.dropna()
         expected = df.iloc[:0]
         self.assert_frame_equal(result, expected)
+
+    def test_fillna_limit_pad(self, data_missing):
+        arr = data_missing.take([1, 0, 0, 0, 1])
+        result = pd.Series(arr).fillna(method='ffill', limit=2)
+        expected = pd.Series(data_missing.take([1, 1, 1, 0, 1]))
+        self.assert_series_equal(result, expected)
+
+    def test_fillna_limit_backfill(self, data_missing):
+        arr = data_missing.take([1, 0, 0, 0, 1])
+        result = pd.Series(arr).fillna(method='backfill', limit=2)
+        expected = pd.Series(data_missing.take([1, 0, 1, 1, 1]))
+        self.assert_series_equal(result, expected)
+
+    def test_fillna_series(self, data_missing):
+        fill_value = data_missing[1]
+        ser = pd.Series(data_missing)
+
+        result = ser.fillna(fill_value)
+        expected = pd.Series(type(data_missing)([fill_value, fill_value]))
+        self.assert_series_equal(result, expected)
+
+        # Fill with a series
+        result = ser.fillna(expected)
+        self.assert_series_equal(result, expected)
+
+        # Fill with a series not affecting the missing values
+        result = ser.fillna(ser)
+        self.assert_series_equal(result, ser)
+
+    @pytest.mark.parametrize('method', ['ffill', 'bfill'])
+    def test_fillna_series_method(self, data_missing, method):
+        fill_value = data_missing[1]
+
+        if method == 'ffill':
+            data_missing = type(data_missing)(data_missing[::-1])
+
+        result = pd.Series(data_missing).fillna(method=method)
+        expected = pd.Series(type(data_missing)([fill_value, fill_value]))
+
+        self.assert_series_equal(result, expected)
+
+    def test_fillna_frame(self, data_missing):
+        fill_value = data_missing[1]
+
+        result = pd.DataFrame({
+            "A": data_missing,
+            "B": [1, 2]
+        }).fillna(fill_value)
+
+        expected = pd.DataFrame({
+            "A": type(data_missing)([fill_value, fill_value]),
+            "B": [1, 2],
+        })
+
+        self.assert_frame_equal(result, expected)
+
+    def test_fillna_fill_other(self, data):
+        result = pd.DataFrame({
+            "A": data,
+            "B": [np.nan] * len(data)
+        }).fillna({"B": 0.0})
+
+        expected = pd.DataFrame({
+            "A": data,
+            "B": [0.0] * len(data),
+        })
+
+        self.assert_frame_equal(result, expected)
diff --git a/pandas/tests/extension/category/test_categorical.py b/pandas/tests/extension/category/test_categorical.py
index 8f413b4a19730..b6dd181c1d8f3 100644
--- a/pandas/tests/extension/category/test_categorical.py
+++ b/pandas/tests/extension/category/test_categorical.py
@@ -69,7 +69,14 @@ def test_getitem_scalar(self):
 
 
 class TestMissing(base.BaseMissingTests):
-    pass
+
+    @pytest.mark.skip(reason="Not implemented")
+    def test_fillna_limit_pad(self):
+        pass
+
+    @pytest.mark.skip(reason="Not implemented")
+    def test_fillna_limit_backfill(self):
+        pass
 
 
 class TestMethods(base.BaseMethodsTests):
diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py
index 736556e4be20d..f1852542088ff 100644
--- a/pandas/tests/extension/decimal/array.py
+++ b/pandas/tests/extension/decimal/array.py
@@ -71,6 +71,7 @@ def isna(self):
         return np.array([x.is_nan() for x in self.values])
 
     def take(self, indexer, allow_fill=True, fill_value=None):
+        indexer = np.asarray(indexer)
         mask = indexer == -1
 
         indexer = _ensure_platform_int(indexer)
diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py
index 7b4d079ecad87..01ae092bc1521 100644
--- a/pandas/tests/extension/decimal/test_decimal.py
+++ b/pandas/tests/extension/decimal/test_decimal.py
@@ -35,68 +35,61 @@ def na_value():
     return decimal.Decimal("NaN")
 
 
-class TestDtype(base.BaseDtypeTests):
-    pass
+class BaseDecimal(object):
+    @staticmethod
+    def assert_series_equal(left, right, *args, **kwargs):
+        # tm.assert_series_equal doesn't handle Decimal('NaN').
+        # We will ensure that the NA values match, and then
+        # drop those values before moving on.
 
+        left_na = left.isna()
+        right_na = right.isna()
 
-class TestInterface(base.BaseInterfaceTests):
-    pass
+        tm.assert_series_equal(left_na, right_na)
+        tm.assert_series_equal(left[~left_na], right[~right_na],
+                               *args, **kwargs)
 
+    @staticmethod
+    def assert_frame_equal(left, right, *args, **kwargs):
+        # TODO(EA): select_dtypes
+        # Only the decimal columns need the NaN-aware series comparison.
+        decimals = [col for col, dtype in left.dtypes.items()
+                    if dtype.name == 'decimal']
 
-class TestConstructors(base.BaseConstructorsTests):
-    pass
+        for col in decimals:
+            BaseDecimal.assert_series_equal(left[col], right[col],
+                                            *args, **kwargs)
 
+        left = left.drop(columns=decimals)
+        right = right.drop(columns=decimals)
+        tm.assert_frame_equal(left, right, *args, **kwargs)
 
-class TestReshaping(base.BaseReshapingTests):
 
-    def test_align(self, data, na_value):
-        # Have to override since assert_series_equal doesn't
-        # compare Decimal(NaN) properly.
-        a = data[:3]
-        b = data[2:5]
-        r1, r2 = pd.Series(a).align(pd.Series(b, index=[1, 2, 3]))
+class TestDtype(BaseDecimal, base.BaseDtypeTests):
+    pass
 
-        # NaN handling
-        e1 = pd.Series(type(data)(list(a) + [na_value]))
-        e2 = pd.Series(type(data)([na_value] + list(b)))
-        tm.assert_series_equal(r1.iloc[:3], e1.iloc[:3])
-        assert r1[3].is_nan()
-        assert e1[3].is_nan()
 
-        tm.assert_series_equal(r2.iloc[1:], e2.iloc[1:])
-        assert r2[0].is_nan()
-        assert e2[0].is_nan()
+class TestInterface(BaseDecimal, base.BaseInterfaceTests):
+    pass
 
-    def test_align_frame(self, data, na_value):
-        # Override for Decimal(NaN) comparison
-        a = data[:3]
-        b = data[2:5]
-        r1, r2 = pd.DataFrame({'A': a}).align(
-            pd.DataFrame({'A': b}, index=[1, 2, 3])
-        )
 
-        # Assumes that the ctor can take a list of scalars of the type
-        e1 = pd.DataFrame({'A': type(data)(list(a) + [na_value])})
-        e2 = pd.DataFrame({'A': type(data)([na_value] + list(b))})
+class TestConstructors(BaseDecimal, base.BaseConstructorsTests):
+    pass
 
-        tm.assert_frame_equal(r1.iloc[:3], e1.iloc[:3])
-        assert r1.loc[3, 'A'].is_nan()
-        assert e1.loc[3, 'A'].is_nan()
 
-        tm.assert_frame_equal(r2.iloc[1:], e2.iloc[1:])
-        assert r2.loc[0, 'A'].is_nan()
-        assert e2.loc[0, 'A'].is_nan()
+class TestReshaping(BaseDecimal, base.BaseReshapingTests):
+    pass
 
 
-class TestGetitem(base.BaseGetitemTests):
+class TestGetitem(BaseDecimal, base.BaseGetitemTests):
     pass
 
 
-class TestMissing(base.BaseMissingTests):
+class TestMissing(BaseDecimal, base.BaseMissingTests):
     pass
 
 
-class TestMethods(base.BaseMethodsTests):
+class TestMethods(BaseDecimal, base.BaseMethodsTests):
     @pytest.mark.parametrize('dropna', [True, False])
     @pytest.mark.xfail(reason="value_counts not implemented yet.")
     def test_value_counts(self, all_data, dropna):
@@ -112,7 +105,7 @@ def test_value_counts(self, all_data, dropna):
         tm.assert_series_equal(result, expected)
 
 
-class TestCasting(base.BaseCastingTests):
+class TestCasting(BaseDecimal, base.BaseCastingTests):
     pass
 
 
diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py
index e0721bb1d8d1a..16d5e4415a79f 100644
--- a/pandas/tests/extension/json/test_json.py
+++ b/pandas/tests/extension/json/test_json.py
@@ -60,7 +60,13 @@ class TestGetitem(base.BaseGetitemTests):
 
 
 class TestMissing(base.BaseMissingTests):
-    pass
+    @pytest.mark.skip(reason="Setting a dict as a scalar")
+    def test_fillna_series(self):
+        """We treat dictionaries as a mapping in fillna, not a scalar."""
+
+    @pytest.mark.skip(reason="Setting a dict as a scalar")
+    def test_fillna_frame(self):
+        """We treat dictionaries as a mapping in fillna, not a scalar."""
 
 
 class TestMethods(base.BaseMethodsTests):