From 0fb9f66506b560a7534b7172098f20f03c4ffb22 Mon Sep 17 00:00:00 2001 From: tp Date: Sat, 24 Feb 2018 10:27:38 +0000 Subject: [PATCH] initialization from dicts for py>=3.6 maintains insertion order --- doc/source/dsintro.rst | 34 ++++++++++++-- doc/source/whatsnew/v0.23.0.txt | 57 +++++++++++++++++++++-- pandas/core/common.py | 12 ++++- pandas/core/frame.py | 9 ++-- pandas/core/panel.py | 6 +-- pandas/core/series.py | 9 +++- pandas/core/sparse/frame.py | 7 ++- pandas/core/sparse/series.py | 4 ++ pandas/tests/frame/test_constructors.py | 20 +++++++- pandas/tests/io/test_excel.py | 8 ++-- pandas/tests/io/test_pytables.py | 2 +- pandas/tests/series/test_constructors.py | 14 +++++- pandas/tests/sparse/frame/test_frame.py | 12 +++++ pandas/tests/sparse/series/test_series.py | 14 +++++- pandas/tests/test_panel.py | 14 +++--- 15 files changed, 190 insertions(+), 32 deletions(-) diff --git a/doc/source/dsintro.rst b/doc/source/dsintro.rst index 1ba00b8fb6f233..56046ad229ead8 100644 --- a/doc/source/dsintro.rst +++ b/doc/source/dsintro.rst @@ -81,9 +81,26 @@ index is passed, one will be created having values ``[0, ..., len(data) - 1]``. **From dict** -If ``data`` is a dict, if **index** is passed the values in data corresponding -to the labels in the index will be pulled out. Otherwise, an index will be -constructed from the sorted keys of the dict, if possible. +.. note:: + + When the data is a dict, and index is not passed, the Series index + will be ordered by the dict's insertion order, if you're using Python + version >= 3.6 and Pandas version >= 0.23. + + If you're using Python < 3.6 or Pandas < 0.23, and index is not passed, + the Series index will be the lexically ordered list of dict keys. + +.. ipython:: python + + d = {'b' : 1, 'a' : 0, 'c' : 2} + pd.Series(d) + +If in the example above you were on a Python version lower than 3.6 or a Pandas +lower than 0.23, the Series would be ordered by the lexical order of the dict +keys (i.e. 
['a', 'b', 'c'] rather than ['b', 'a', 'c']). + +If an index is passed, the values in data corresponding to the labels in the +index will be pulled out. .. ipython:: python @@ -243,12 +260,21 @@ not matching up to the passed index. If axis labels are not passed, they will be constructed from the input data based on common sense rules. +.. note:: + + When the data is a dict, and columns is not passed, the DataFrame columns + will be ordered by the dict's insertion order, if you're using Python + version >= 3.6 and Pandas >= 0.23. + + If you're using Python < 3.6 or Pandas < 0.23, and columns is not passed, + the DataFrame columns will be the lexically ordered list of dict keys. + From dict of Series or dicts ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The resulting **index** will be the **union** of the indexes of the various Series. If there are any nested dicts, these will first be converted to -Series. If no columns are passed, the columns will be the sorted list of dict +Series. If no columns are passed, the columns will be the ordered list of dict keys. .. ipython:: python diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 542e62aa374bef..6eee711b04acc7 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -3,7 +3,7 @@ v0.23.0 ------- -This is a major release from 0.21.1 and includes a number of API changes, +This is a major release from 0.22.0 and includes a number of API changes, deprecations, new features, enhancements, and performance improvements along with a large number of bug fixes. We recommend that all users upgrade to this version. @@ -249,7 +249,7 @@ The :func:`DataFrame.assign` now accepts dependent keyword arguments for python using ``.assign()`` to update an existing column. Previously, callables referring to other variables being updated would get the "old" values - Previous Behaviour: + Previous Behavior: .. 
code-block:: ipython @@ -262,7 +262,7 @@ The :func:`DataFrame.assign` now accepts dependent keyword arguments for python 1 3 -2 2 4 -3 - New Behaviour: + New Behavior: .. ipython:: python @@ -329,6 +329,57 @@ If installed, we now require: | openpyxl | 2.4.0 | | +-----------------+-----------------+----------+ +.. _whatsnew_0230.api_breaking.dict_insertion_order: + +Instantiation from dicts preserves dict insertion order for Python 3.6+ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Until Python 3.6, dicts in Python had no formally defined ordering. For Python +version 3.6 and later, dicts are ordered by insertion order, see +`PEP 468 <https://www.python.org/dev/peps/pep-0468/>`_. +Pandas will use the dict's insertion order when creating Series or +DataFrames from dicts (:issue:`19018`) if you're using Python version 3.6 or +higher. + +Previous Behavior (and current behavior if on Python < 3.6): + +.. code-block:: ipython + + In [1]: pd.Series({'Income': 2000, + ... 'Expenses': -1500, + ... 'Taxes': -200, + ... 'Net result': 300}) + Expenses -1500 + Income 2000 + Net result 300 + Taxes -200 + dtype: int64 + +Note the Series above is ordered alphabetically by the index values. + +New Behavior (for Python >= 3.6): + +.. ipython:: python + + pd.Series({'Income': 2000, + 'Expenses': -1500, + 'Taxes': -200, + 'Net result': 300}) + +Notice that the Series is now ordered by insertion order. This new behavior is +used for all relevant pandas types (``Series``, ``DataFrame``, ``SparseSeries`` +and ``SparseDataFrame``). + +If you wish to retain the old behavior while using Python >= 3.6, you can use +``sort_index``: + +.. ipython:: python + + pd.Series({'Income': 2000, + 'Expenses': -1500, + 'Taxes': -200, + 'Net result': 300}).sort_index() + .. 
_whatsnew_0230.api_breaking.deprecate_panel: Deprecate Panel diff --git a/pandas/core/common.py b/pandas/core/common.py index c4fbcf28cbcae9..c4890dbd39ef1b 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -11,7 +11,7 @@ from pandas._libs import lib, tslib from pandas import compat -from pandas.compat import long, zip, iteritems +from pandas.compat import long, zip, iteritems, PY36, OrderedDict from pandas.core.config import get_option from pandas.core.dtypes.generic import ABCSeries, ABCIndex from pandas.core.dtypes.common import _NS_DTYPE @@ -186,6 +186,16 @@ def _try_sort(iterable): return listed +def _dict_keys_to_ordered_list(mapping): + # when pandas drops support for Python < 3.6, this function + # can be replaced by a simple list(mapping.keys()) + if PY36 or isinstance(mapping, OrderedDict): + keys = list(mapping.keys()) + else: + keys = _try_sort(mapping) + return keys + + def iterpairs(seq): """ Parameters diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ae8fb48a61fceb..ff4064b3f8c563 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -252,6 +252,11 @@ class DataFrame(NDFrame): ---------- data : numpy ndarray (structured or homogeneous), dict, or DataFrame Dict can contain Series, arrays, constants, or list-like objects + + .. versionchanged :: 0.23.0 + If data is a dict, argument order is maintained for Python 3.6 + and later. + index : Index or array-like Index to use for resulting frame. 
Will default to RangeIndex if no indexing information part of input data and no index provided @@ -460,9 +465,7 @@ def _init_dict(self, data, index, columns, dtype=None): arrays.append(v) else: - keys = list(data.keys()) - if not isinstance(data, OrderedDict): - keys = com._try_sort(keys) + keys = com._dict_keys_to_ordered_list(data) columns = data_names = Index(keys) arrays = [data[k] for k in keys] diff --git a/pandas/core/panel.py b/pandas/core/panel.py index fc7fad861df442..052d555df76f11 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -204,10 +204,8 @@ def _init_dict(self, data, axes, dtype=None): for k, v in compat.iteritems(data) if k in haxis) else: - ks = list(data.keys()) - if not isinstance(data, OrderedDict): - ks = com._try_sort(ks) - haxis = Index(ks) + keys = com._dict_keys_to_ordered_list(data) + haxis = Index(keys) for k, v in compat.iteritems(data): if isinstance(v, dict): diff --git a/pandas/core/series.py b/pandas/core/series.py index 660bf3f5d48054..069f0372ab6e1a 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -54,7 +54,7 @@ from pandas import compat from pandas.io.formats.terminal import get_terminal_size from pandas.compat import ( - zip, u, OrderedDict, StringIO, range, get_range_parameters) + zip, u, OrderedDict, StringIO, range, get_range_parameters, PY36) from pandas.compat.numpy import function as nv import pandas.core.ops as ops @@ -130,6 +130,11 @@ class Series(base.IndexOpsMixin, generic.NDFrame): ---------- data : array-like, dict, or scalar value Contains data stored in Series + + .. versionchanged :: 0.23.0 + If data is a dict, argument order is maintained for Python 3.6 + and later. + index : array-like or Index (1d) Values must be hashable and have the same length as `data`. Non-unique index values are allowed. 
Will default to @@ -297,7 +302,7 @@ def _init_dict(self, data, index=None, dtype=None): # Now we just make sure the order is respected, if any if index is not None: s = s.reindex(index, copy=False) - elif not isinstance(data, OrderedDict): + elif not PY36 and not isinstance(data, OrderedDict): try: s = s.sort_index() except TypeError: diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index d89b1d681c4783..2cefbea7220982 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -39,6 +39,10 @@ class SparseDataFrame(DataFrame): Parameters ---------- data : same types as can be passed to DataFrame or scipy.sparse.spmatrix + .. versionchanged :: 0.23.0 + If data is a dict, argument order is maintained for Python 3.6 + and later. + index : array-like, optional column : array-like, optional default_kind : {'block', 'integer'}, default 'block' @@ -138,7 +142,8 @@ def _init_dict(self, data, index, columns, dtype=None): columns = _ensure_index(columns) data = {k: v for k, v in compat.iteritems(data) if k in columns} else: - columns = Index(com._try_sort(list(data.keys()))) + keys = com._dict_keys_to_ordered_list(data) + columns = Index(keys) if index is None: index = extract_index(list(data.values())) diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py index f8b98a1a400811..714cd09a27294e 100644 --- a/pandas/core/sparse/series.py +++ b/pandas/core/sparse/series.py @@ -42,6 +42,10 @@ class SparseSeries(Series): Parameters ---------- data : {array-like, Series, SparseSeries, dict} + .. versionchanged :: 0.23.0 + If data is a dict, argument order is maintained for Python 3.6 + and later. + kind : {'block', 'integer'} fill_value : float Code for missing value. Defaults depends on dtype. 
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index e0b94815878dde..499751e8643318 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -15,7 +15,7 @@ from pandas.core.dtypes.common import is_integer_dtype from pandas.compat import (lmap, long, zip, range, lrange, lzip, - OrderedDict, is_platform_little_endian) + OrderedDict, is_platform_little_endian, PY36) from pandas import compat from pandas import (DataFrame, Index, Series, isna, MultiIndex, Timedelta, Timestamp, @@ -290,6 +290,24 @@ def test_constructor_dict(self): with tm.assert_raises_regex(ValueError, msg): DataFrame({'a': 0.7}, columns=['b']) + @pytest.mark.skipif(not PY36, reason='Insertion order for Python>=3.6') + def test_constructor_dict_order_insertion(self): + # GH19018 + # initialization ordering: by insertion order if python>= 3.6 + d = {'b': self.ts2, 'a': self.ts1} + frame = DataFrame(data=d) + expected = DataFrame(data=d, columns=list('ba')) + tm.assert_frame_equal(frame, expected) + + @pytest.mark.skipif(PY36, reason='order by value for Python<3.6') + def test_constructor_dict_order_by_values(self): + # GH19018 + # initialization ordering: by value if python<3.6 + d = {'b': self.ts2, 'a': self.ts1} + frame = DataFrame(data=d) + expected = DataFrame(data=d, columns=list('ab')) + tm.assert_frame_equal(frame, expected) + def test_constructor_multi_index(self): # GH 4078 # construction error with mi and all-nan frame diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 15d3062394d6e7..0b80af11520b53 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -762,17 +762,17 @@ def test_read_excel_multiindex_empty_level(self, ext): # GH 12453 with ensure_clean('.xlsx') as path: df = DataFrame({ - ('Zero', ''): {0: 0}, ('One', 'x'): {0: 1}, ('Two', 'X'): {0: 3}, - ('Two', 'Y'): {0: 7} + ('Two', 'Y'): {0: 7}, + ('Zero', ''): {0: 0} }) expected = 
DataFrame({ - ('Zero', 'Unnamed: 3_level_1'): {0: 0}, ('One', u'x'): {0: 1}, ('Two', u'X'): {0: 3}, - ('Two', u'Y'): {0: 7} + ('Two', u'Y'): {0: 7}, + ('Zero', 'Unnamed: 3_level_1'): {0: 0} }) df.to_excel(path) diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index 04da6da74059be..e690b1e302d8bf 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -2034,7 +2034,7 @@ def test_table_values_dtypes_roundtrip(self): 'bool': 1, 'int16': 1, 'int8': 1, 'int64': 1, 'object': 1, 'datetime64[ns]': 2}) result = result.sort_index() - result = expected.sort_index() + expected = expected.sort_index() tm.assert_series_equal(result, expected) def test_table_mixed_dtypes(self): diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 25f425ffa00215..e0bfe41645a3f8 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -22,7 +22,7 @@ from pandas._libs import lib from pandas._libs.tslib import iNaT -from pandas.compat import lrange, range, zip, long +from pandas.compat import lrange, range, zip, long, PY36 from pandas.util.testing import assert_series_equal import pandas.util.testing as tm @@ -811,6 +811,18 @@ def test_constructor_dict(self): expected.iloc[1] = 1 assert_series_equal(result, expected) + def test_constructor_dict_order(self): + # GH19018 + # initialization ordering: by insertion order if python>= 3.6, else + # order by value + d = {'b': 1, 'a': 0, 'c': 2} + result = Series(d) + if PY36: + expected = Series([1, 0, 2], index=list('bac')) + else: + expected = Series([0, 1, 2], index=list('abc')) + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("value", [2, np.nan, None, float('nan')]) def test_constructor_dict_nan_key(self, value): # GH 18480 diff --git a/pandas/tests/sparse/frame/test_frame.py b/pandas/tests/sparse/frame/test_frame.py index ee0d63aff73672..1062de3119efc0 100644 --- 
a/pandas/tests/sparse/frame/test_frame.py +++ b/pandas/tests/sparse/frame/test_frame.py @@ -139,6 +139,18 @@ def test_constructor(self): repr(self.frame) + def test_constructor_dict_order(self): + # GH19018 + # initialization ordering: by insertion order if python>= 3.6, else + # order by value + d = {'b': [2, 3], 'a': [0, 1]} + frame = SparseDataFrame(data=d) + if compat.PY36: + expected = SparseDataFrame(data=d, columns=list('ba')) + else: + expected = SparseDataFrame(data=d, columns=list('ab')) + tm.assert_sp_frame_equal(frame, expected) + def test_constructor_ndarray(self): # no index or columns sp = SparseDataFrame(self.frame.values) diff --git a/pandas/tests/sparse/series/test_series.py b/pandas/tests/sparse/series/test_series.py index 3f5d5a59cc5402..eb63c87820070e 100644 --- a/pandas/tests/sparse/series/test_series.py +++ b/pandas/tests/sparse/series/test_series.py @@ -14,7 +14,7 @@ from pandas.tseries.offsets import BDay import pandas.util.testing as tm import pandas.util._test_decorators as td -from pandas.compat import range +from pandas.compat import range, PY36 from pandas.core.reshape.util import cartesian_product import pandas.core.sparse.frame as spf @@ -114,6 +114,18 @@ def test_constructor_dict_input(self): result = SparseSeries(constructor_dict) tm.assert_sp_series_equal(result, expected) + def test_constructor_dict_order(self): + # GH19018 + # initialization ordering: by insertion order if python>= 3.6, else + # order by value + d = {'b': 1, 'a': 0, 'c': 2} + result = SparseSeries(d) + if PY36: + expected = SparseSeries([1, 0, 2], index=list('bac')) + else: + expected = SparseSeries([0, 1, 2], index=list('abc')) + tm.assert_sp_series_equal(result, expected) + def test_constructor_dtype(self): arr = SparseSeries([np.nan, 1, 2, np.nan]) assert arr.dtype == np.float64 diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 1955fc301be9b0..301a7fc437fcfc 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ 
-2368,14 +2368,16 @@ def test_update_from_dict(self): pan.update(other) expected = Panel( - {'two': DataFrame([[3.6, 2., 3], - [1.5, np.nan, 7], - [1.5, np.nan, 3.], - [1.5, np.nan, 3.]]), - 'one': DataFrame([[1.5, np.nan, 3.], + {'one': DataFrame([[1.5, np.nan, 3.], [1.5, np.nan, 3.], [1.5, np.nan, 3.], - [1.5, np.nan, 3.]])}) + [1.5, np.nan, 3.]]), + 'two': DataFrame([[3.6, 2., 3], + [1.5, np.nan, 7], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.]]) + } + ) assert_panel_equal(pan, expected)