Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Construction of Series from dict containing NaN as key #18496

Merged
merged 3 commits into from
Dec 1, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.22.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ Other API Changes

- :func:`Series.astype` and :func:`Index.astype` with an incompatible dtype will now raise a ``TypeError`` rather than a ``ValueError`` (:issue:`18231`)
- ``Series`` construction with an ``object`` dtyped tz-aware datetime and ``dtype=object`` specified, will now return an ``object`` dtyped ``Series``, previously this would infer the datetime dtype (:issue:`18231`)
- A :class:`Series` of ``dtype=category`` constructed from an empty ``dict`` will now have categories of ``dtype=object`` rather than ``dtype=float64``, consistently with the case in which an empty list is passed (:issue:`18515`)
- ``NaT`` division with :class:`datetime.timedelta` will now return ``NaN`` instead of raising (:issue:`17876`)
- All-NaN levels in a ``MultiIndex`` are now assigned ``float`` rather than ``object`` dtype, promoting consistency with ``Index`` (:issue:`17929`).
- :class:`Timestamp` will no longer silently ignore unused or invalid ``tz`` or ``tzinfo`` keyword arguments (:issue:`17690`)
Expand Down Expand Up @@ -208,5 +209,6 @@ Other

- Improved error message when attempting to use a Python keyword as an identifier in a numexpr query (:issue:`18221`)
- Fixed a bug where creating a Series from an array that contains both tz-naive and tz-aware values will result in a Series whose dtype is tz-aware instead of object (:issue:`16406`)
- Fixed construction of a :class:`Series` from a ``dict`` containing ``NaN`` as key (:issue:`18480`)
- Adding a ``Period`` object to a ``datetime`` or ``Timestamp`` object will now correctly raise a ``TypeError`` (:issue:`17983`)
-
5 changes: 2 additions & 3 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -874,9 +874,8 @@ def _map_values(self, mapper, na_action=None):
# convert to an Series for efficiency.
# we specify the keys here to handle the
# possibility that they are tuples
from pandas import Series, Index
index = Index(mapper, tupleize_cols=False)
mapper = Series(mapper, index=index)
from pandas import Series
mapper = Series(mapper)

if isinstance(mapper, ABCSeries):
# Since values were input this means we came from either
Expand Down
21 changes: 0 additions & 21 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2822,27 +2822,6 @@ def get_indexer_for(self, target, **kwargs):
indexer, _ = self.get_indexer_non_unique(target, **kwargs)
return indexer

_index_shared_docs['_get_values_from_dict'] = """
Return the values of the input dictionary in the order the keys are
in the index. np.nan is returned for index values not in the
dictionary.

Parameters
----------
data : dict
The dictionary from which to extract the values

Returns
-------
np.array

"""

@Appender(_index_shared_docs['_get_values_from_dict'])
def _get_values_from_dict(self, data):
return lib.fast_multiget(data, self.values,
default=np.nan)

def _maybe_promote(self, other):
# A hack, but it works
from pandas.core.indexes.datetimes import DatetimeIndex
Expand Down
8 changes: 0 additions & 8 deletions pandas/core/indexes/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -698,14 +698,6 @@ def __rsub__(self, other):
def _add_delta(self, other):
return NotImplemented

@Appender(_index_shared_docs['_get_values_from_dict'])
def _get_values_from_dict(self, data):
if len(data):
return np.array([data.get(i, np.nan)
for i in self.asobject.values])

return np.array([np.nan])

def _add_delta_td(self, other):
# add a delta of a timedeltalike
# return the i8 result view
Expand Down
11 changes: 0 additions & 11 deletions pandas/core/indexes/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -1457,17 +1457,6 @@ def get_value_maybe_box(self, series, key):
key, tz=self.tz)
return _maybe_box(self, values, series, key)

@Appender(_index_shared_docs['_get_values_from_dict'])
def _get_values_from_dict(self, data):
if len(data):
# coerce back to datetime objects for lookup
data = com._dict_compat(data)
return lib.fast_multiget(data,
self.asobject.values,
default=np.nan)

return np.array([np.nan])

def get_loc(self, key, method=None, tolerance=None):
"""
Get integer location for requested label
Expand Down
55 changes: 42 additions & 13 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@
_default_index,
_asarray_tuplesafe,
_values_from_object,
_try_sort,
_maybe_match_name,
SettingWithCopyError,
_maybe_box_datetimelike,
Expand Down Expand Up @@ -198,18 +197,9 @@ def __init__(self, data=None, index=None, dtype=None, name=None,
data = data.reindex(index, copy=copy)
data = data._data
elif isinstance(data, dict):
if index is None:
if isinstance(data, OrderedDict):
index = Index(data)
else:
index = Index(_try_sort(data))

try:
data = index._get_values_from_dict(data)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this was the only usage of _get_values_from_dict, so this could be cleaned up. There are also multiple implementation (for the different types of indices, not sure if those differences are important and are all catched in the new implementation)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The different implementations seem unused and broken, see for instance

In [2]: pd.Index([])._get_values_from_dict({})
Out[2]: array([], dtype=float64)

In [3]: pd.DatetimeIndex([])._get_values_from_dict({})
Out[3]: array([ nan])

however, in principle they do something sensible (not necessarily expected), which is to look for Timestamp keys in the dict. The "new implementation", that is the Series construction, doesn't care about this (and shouldn't, I think).

I'm OK with removing all of this if you want (in another PR).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It can be removed here (it is this PR that changes the implementation)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done, ping

except TypeError:
data = ([data.get(i, np.nan) for i in index]
if data else np.nan)

data, index = self._init_dict(data, index, dtype)
dtype = None
copy = False
elif isinstance(data, SingleBlockManager):
if index is None:
index = data.index
Expand Down Expand Up @@ -257,6 +247,45 @@ def __init__(self, data=None, index=None, dtype=None, name=None,
self.name = name
self._set_axis(0, index, fastpath=True)

def _init_dict(self, data, index=None, dtype=None):
"""
Derive the "_data" and "index" attributes of a new Series from a
dictionary input.

Parameters
----------
data : dict or dict-like
Data used to populate the new Series
index : Index or index-like, default None
index for the new Series: if None, use dict keys
dtype : dtype, default None
dtype for the new Series: if None, infer from data

Returns
-------
_data : BlockManager for the new Series
index : index for the new Series
"""
# Looking for NaN in dict doesn't work ({np.nan : 1}[float('nan')]
# raises KeyError), so we iterate the entire dict, and align
if data:
keys, values = zip(*compat.iteritems(data))
else:
keys, values = [], []

# Input is now list-like, so rely on "standard" construction:
s = Series(values, index=keys, dtype=dtype)

# Now we just make sure the order is respected, if any
if index is not None:
s = s.reindex(index, copy=False)
elif not isinstance(data, OrderedDict):
try:
s = s.sort_index()
except TypeError:
pass
return s._data, s.index

@classmethod
def from_array(cls, arr, index=None, name=None, dtype=None, copy=False,
fastpath=False):
Expand Down
1 change: 1 addition & 0 deletions pandas/tests/series/test_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -422,6 +422,7 @@ def test_map_dict_with_tuple_keys(self):
converted to a multi-index, preventing tuple values
from being mapped properly.
"""
# GH 18496
df = pd.DataFrame({'a': [(1, ), (2, ), (3, 4), (5, 6)]})
label_mappings = {(1, ): 'A', (2, ): 'B', (3, 4): 'A', (5, 6): 'B'}

Expand Down
3 changes: 2 additions & 1 deletion pandas/tests/series/test_combine_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,8 @@ def test_concat_empty_series_dtypes(self):
# categorical
assert pd.concat([Series(dtype='category'),
Series(dtype='category')]).dtype == 'category'
assert pd.concat([Series(dtype='category'),
# GH 18515
assert pd.concat([Series(np.array([]), dtype='category'),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do you change this? (it's only the dtype of the category, it is still a categorical series, so for the concat that does not matter)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, the dtype of the category does matter for concat (and rightly so, since conceptually the fact that a categorical of ints is really a categorical is only an implementation detail when you're going to concat it to a non-categorical anyway).

(... or just try that test before and after my change)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How does this PR change the behaviour of empty category series?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I get that when #18515 is fixed, this test should be changed this way, but please leave that change for a PR that actually fixes #18515

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it clear to you that that test fails with this PR? If yes, please clarify/reformulate what you're asking for.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If this PR changes (fixes) the dtype of categories for pd.Series(dtype='category'), please specify so and add an explicit test for this (and a whatsnew bug fix note).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So data is None is handled with data = {}, and thus this PR affects this behaviour? So this does fix #18515

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, asking is good. This PR makes (incidentally, but it is the correct behaviour) category series initialized with an empty dict behave like category series initialized with an empty list, that is have object dtype. This fixes #18515 (but not #17261 ). Since this test is about concatenating an empty float category Series to an empty float (non-category) Series, I had to fix the former so that it still had dtype float64.

So since you reopened #18515, now I will add "closes #18515" and push again.

OK?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep.
And in general, please add commits instead of amending if you make such additional changes. Makes reviewing easier.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done, ping

Series(dtype='float64')]).dtype == 'float64'
assert pd.concat([Series(dtype='category'),
Series(dtype='object')]).dtype == 'object'
Expand Down
55 changes: 47 additions & 8 deletions pandas/tests/series/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import pytest

from datetime import datetime, timedelta
from collections import OrderedDict

from numpy import nan
import numpy as np
Expand Down Expand Up @@ -79,17 +80,42 @@ def test_constructor(self):
m = MultiIndex.from_arrays([[1, 2], [3, 4]])
pytest.raises(NotImplementedError, Series, m)

def test_constructor_empty(self):
@pytest.mark.parametrize('input_class', [list, dict, OrderedDict])
def test_constructor_empty(self, input_class):
empty = Series()
empty2 = Series([])
empty2 = Series(input_class())

# the are Index() and RangeIndex() which don't compare type equal
# these are Index() and RangeIndex() which don't compare type equal
# but are just .equals
assert_series_equal(empty, empty2, check_index_type=False)

empty = Series(index=lrange(10))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so I think these tests got eliminated? can you add another test (or add onto the construction via np.nan, None, float('nan') which hits this case?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test is wrong (dtype shouldn't be float64), the right one would fail, see #17261 . Adding an xfailing test.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The xfailing test would basically be this one with explicit dtype, it's better to change it when fixing #17261 . Adding a fixed test for this instead.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done, ping

empty2 = Series(np.nan, index=lrange(10))
assert_series_equal(empty, empty2)
# With explicit dtype:
empty = Series(dtype='float64')
empty2 = Series(input_class(), dtype='float64')
assert_series_equal(empty, empty2, check_index_type=False)

# GH 18515 : with dtype=category:
empty = Series(dtype='category')
empty2 = Series(input_class(), dtype='category')
assert_series_equal(empty, empty2, check_index_type=False)

if input_class is not list:
# With index:
empty = Series(index=lrange(10))
empty2 = Series(input_class(), index=lrange(10))
assert_series_equal(empty, empty2)

# With index and dtype float64:
empty = Series(np.nan, index=lrange(10))
empty2 = Series(input_class(), index=lrange(10), dtype='float64')
assert_series_equal(empty, empty2)

@pytest.mark.parametrize('input_arg', [np.nan, float('nan')])
def test_constructor_nan(self, input_arg):
empty = Series(dtype='float64', index=lrange(10))
empty2 = Series(input_arg, index=lrange(10))

assert_series_equal(empty, empty2, check_index_type=False)

def test_constructor_series(self):
index1 = ['d', 'b', 'a', 'c']
Expand Down Expand Up @@ -625,6 +651,21 @@ def test_constructor_dict(self):
expected.iloc[1] = 1
assert_series_equal(result, expected)

@pytest.mark.parametrize("value", [2, np.nan, None, float('nan')])
def test_constructor_dict_nan_key(self, value):
# GH 18480
d = {1: 'a', value: 'b', float('nan'): 'c', 4: 'd'}
result = Series(d).sort_values()
expected = Series(['a', 'b', 'c', 'd'], index=[1, value, np.nan, 4])
assert_series_equal(result, expected)

# MultiIndex:
d = {(1, 1): 'a', (2, np.nan): 'b', (3, value): 'c'}
result = Series(d).sort_values()
expected = Series(['a', 'b', 'c'],
index=Index([(1, 1), (2, np.nan), (3, value)]))
assert_series_equal(result, expected)

def test_constructor_dict_datetime64_index(self):
# GH 9456

Expand Down Expand Up @@ -658,8 +699,6 @@ def test_constructor_tuple_of_tuples(self):
s = Series(data)
assert tuple(s) == data

@pytest.mark.xfail(reason='GH 18480 (Series initialization from dict with '
'NaN keys')
def test_constructor_dict_of_tuples(self):
data = {(1, 2): 3,
(None, 5): 6}
Expand Down