initialization from dicts for py>=3.6 maintains insertion order

pandas-dev · Feb 25, 2018 · 06d3c33 · 06d3c33
1 parent feedf66
commit 06d3c33
Show file tree

Hide file tree

Showing 10 changed files with 148 additions and 14 deletions.
diff --git a/doc/source/dsintro.rst b/doc/source/dsintro.rst
@@ -81,9 +81,21 @@ index is passed, one will be created having values ``[0, ..., len(data) - 1]``.
 
 **From dict**
 
-If ``data`` is a dict, if **index** is passed the values in data corresponding
-to the labels in the index will be pulled out. Otherwise, an index will be
-constructed from the sorted keys of the dict, if possible.
+When creating a pandas Series from a dict, the Series will be ordered by the
+dict's insertion order, if you are using Python 3.6+ and no index has been
+supplied.
+
+.. ipython:: python
+
+   d = {'b': 1, 'a': 0, 'c': 2}
+   pd.Series(d)
+
+If you are using a Python version lower than 3.6 and no index is passed, the
+Series will be sorted by the lexical order of the dict's keys
+(i.e. ``['a', 'b', 'c']`` in the example above).
+
+If an index is passed, the values in data corresponding to the labels in the
+index will be pulled out.
 
 .. ipython:: python
 
@@ -277,6 +289,8 @@ The row and column labels can be accessed respectively by accessing the
 From dict of ndarrays / lists
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
+The columns will be ordered by the dict's insertion order if you are using
+Python 3.6+; on Python < 3.6 the columns will be ordered lexically.
 The ndarrays must all be the same length. If an index is passed, it must
 clearly also be the same length as the arrays. If no index is passed, the
 result will be ``range(n)``, where ``n`` is the array length.

diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
@@ -3,7 +3,7 @@
 v0.23.0
 -------
 
-This is a major release from 0.21.1 and includes a number of API changes,
+This is a major release from 0.22.0 and includes a number of API changes,
 deprecations, new features, enhancements, and performance improvements along
 with a large number of bug fixes. We recommend that all users upgrade to this
 version.
@@ -240,7 +240,7 @@ The :func:`DataFrame.assign` now accepts dependent keyword arguments for python
   using ``.assign()`` to update an existing column. Previously, callables
   referring to other variables being updated would get the "old" values
 
-  Previous Behaviour:
+  Previous behaviour:
 
   .. code-block:: ipython
 
@@ -253,7 +253,7 @@ The :func:`DataFrame.assign` now accepts dependent keyword arguments for python
       1  3 -2
       2  4 -3
 
-  New Behaviour:
+  New behaviour:
 
   .. ipython:: python
 
@@ -320,6 +320,57 @@ If installed, we now require:
 | openpyxl        | 2.4.0           |          |
 +-----------------+-----------------+----------+
 
+.. _whatsnew_0230.api_breaking.dict_insertion_order:
+
+Creating dataframes and series from dicts preserves dict insertion order for python 3.6+
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Until Python 3.6, dicts in Python had no formally defined ordering. In Python
+3.6 and later, dicts are ordered by insertion order
+(see also `PEP 468 <https://www.python.org/dev/peps/pep-0468/>`_).
+From version 0.23.0, pandas uses insertion order when creating ``Series`` or
+``DataFrame`` objects from dicts (:issue:`19018`).
+
+Previous behaviour (and current behaviour if on Python < 3.6):
+
+.. code-block:: ipython
+
+   In [1]: pd.Series({'Income': 2000,
+   ...                 'Expenses': -1500,
+   ...                 'Taxes': -200,
+   ...                 'Net result': 300})
+   Expenses     -1500
+   Income        2000
+   Net result     300
+   Taxes         -200
+   dtype: int64
+
+Note the series above is ordered alphabetically by the index values.
+
+New behaviour (for Python >= 3.6):
+
+.. ipython:: python
+
+    pd.Series({'Income': 2000,
+               'Expenses': -1500,
+               'Taxes': -200,
+               'Net result': 300})
+
+Notice that the series is now ordered by insertion order. This new behaviour is
+used for all relevant pandas types (``Series``, ``DataFrame``, ``SparseSeries``
+and ``SparseDataFrame``).
+
+If you wish to retain the old behaviour while using Python >= 3.6, you can use
+``sort_index``:
+
+.. ipython:: python
+
+    pd.Series({'Income': 2000,
+               'Expenses': -1500,
+               'Taxes': -200,
+               'Net result': 300}).sort_index()
+
 .. _whatsnew_0230.api_breaking.deprecate_panel:
 
 Deprecate Panel

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -251,6 +251,11 @@ class DataFrame(NDFrame):
     ----------
     data : numpy ndarray (structured or homogeneous), dict, or DataFrame
         Dict can contain Series, arrays, constants, or list-like objects
+
+        .. versionchanged :: 0.23.0
+           If data is a dict, insertion order is maintained for Python 3.6
+           and later.
+
     index : Index or array-like
         Index to use for resulting frame. Will default to RangeIndex if
         no indexing information part of input data and no index provided
@@ -460,7 +465,7 @@ def _init_dict(self, data, index, columns, dtype=None):
 
         else:
             keys = list(data.keys())
-            if not isinstance(data, OrderedDict):
+            if not PY36 and not isinstance(data, OrderedDict):
                 keys = com._try_sort(keys)
             columns = data_names = Index(keys)
             arrays = [data[k] for k in keys]

diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -54,7 +54,7 @@
 from pandas import compat
 from pandas.io.formats.terminal import get_terminal_size
 from pandas.compat import (
-    zip, u, OrderedDict, StringIO, range, get_range_parameters)
+    zip, u, OrderedDict, StringIO, range, get_range_parameters, PY36)
 from pandas.compat.numpy import function as nv
 
 import pandas.core.ops as ops
@@ -130,6 +130,11 @@ class Series(base.IndexOpsMixin, generic.NDFrame):
     ----------
     data : array-like, dict, or scalar value
         Contains data stored in Series
+
+        .. versionchanged :: 0.23.0
+           If data is a dict, insertion order is maintained for Python 3.6
+           and later.
+
     index : array-like or Index (1d)
         Values must be hashable and have the same length as `data`.
         Non-unique index values are allowed. Will default to
@@ -286,7 +291,7 @@ def _init_dict(self, data, index=None, dtype=None):
         # Now we just make sure the order is respected, if any
         if index is not None:
             s = s.reindex(index, copy=False)
-        elif not isinstance(data, OrderedDict):
+        elif not PY36 and not isinstance(data, OrderedDict):
             try:
                 s = s.sort_index()
             except TypeError:

diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py
@@ -6,7 +6,7 @@
 # pylint: disable=E1101,E1103,W0231,E0202
 
 import warnings
-from pandas.compat import lmap
+from pandas.compat import lmap, OrderedDict, PY36
 from pandas import compat
 import numpy as np
 
@@ -39,6 +39,10 @@ class SparseDataFrame(DataFrame):
     Parameters
     ----------
     data : same types as can be passed to DataFrame or scipy.sparse.spmatrix
+        .. versionchanged :: 0.23.0
+           If data is a dict, insertion order is maintained for Python 3.6
+           and later.
+
     index : array-like, optional
     column : array-like, optional
     default_kind : {'block', 'integer'}, default 'block'
@@ -138,7 +142,10 @@ def _init_dict(self, data, index, columns, dtype=None):
             columns = _ensure_index(columns)
             data = {k: v for k, v in compat.iteritems(data) if k in columns}
         else:
-            columns = Index(com._try_sort(list(data.keys())))
+            keys = list(data.keys())
+            if not PY36 and not isinstance(data, OrderedDict):
+                keys = com._try_sort(keys)
+            columns = Index(keys)
 
         if index is None:
             index = extract_index(list(data.values()))

diff --git a/pandas/core/sparse/series.py b/pandas/core/sparse/series.py
@@ -42,6 +42,10 @@ class SparseSeries(Series):
     Parameters
     ----------
     data : {array-like, Series, SparseSeries, dict}
+        .. versionchanged :: 0.23.0
+           If data is a dict, insertion order is maintained for Python 3.6
+           and later.
+
     kind : {'block', 'integer'}
     fill_value : float
         Code for missing value. Defaults depends on dtype.

diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
@@ -15,7 +15,7 @@
 
 from pandas.core.dtypes.common import is_integer_dtype
 from pandas.compat import (lmap, long, zip, range, lrange, lzip,
-                           OrderedDict, is_platform_little_endian)
+                           OrderedDict, is_platform_little_endian, PY36)
 from pandas import compat
 from pandas import (DataFrame, Index, Series, isna,
                     MultiIndex, Timedelta, Timestamp,
@@ -290,6 +290,18 @@ def test_constructor_dict(self):
         with tm.assert_raises_regex(ValueError, msg):
             DataFrame({'a': 0.7}, columns=['b'])
 
+    def test_constructor_dict_order(self):
+        # GH19018
+        # initialization ordering: by insertion order if python >= 3.6, else
+        # sorted by dict key (column label)
+        d = {'b': self.ts2, 'a': self.ts1}
+        frame = DataFrame(data=d)
+        if compat.PY36:
+            expected = DataFrame(data=d, columns=list('ba'))
+        else:
+            expected = DataFrame(data=d, columns=list('ab'))
+        tm.assert_frame_equal(frame, expected)
+
     def test_constructor_multi_index(self):
         # GH 4078
         # construction error with mi and all-nan frame

diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
@@ -22,7 +22,7 @@
 from pandas._libs import lib
 from pandas._libs.tslib import iNaT
 
-from pandas.compat import lrange, range, zip, long
+from pandas.compat import lrange, range, zip, long, PY36
 from pandas.util.testing import assert_series_equal
 import pandas.util.testing as tm
 
@@ -783,6 +783,18 @@ def test_constructor_dict(self):
         expected.iloc[1] = 1
         assert_series_equal(result, expected)
 
+    def test_constructor_dict_order(self):
+        # GH19018
+        # initialization ordering: by insertion order if python >= 3.6, else
+        # sorted by dict key (index label)
+        d = {'b': 1, 'a': 0, 'c': 2}
+        result = Series(d)
+        if PY36:
+            expected = Series([1, 0, 2], index=list('bac'))
+        else:
+            expected = Series([0, 1, 2], index=list('abc'))
+        tm.assert_series_equal(result, expected)
+
     @pytest.mark.parametrize("value", [2, np.nan, None, float('nan')])
     def test_constructor_dict_nan_key(self, value):
         # GH 18480

diff --git a/pandas/tests/sparse/frame/test_frame.py b/pandas/tests/sparse/frame/test_frame.py
@@ -139,6 +139,18 @@ def test_constructor(self):
 
         repr(self.frame)
 
+    def test_constructor_dict_order(self):
+        # GH19018
+        # initialization ordering: by insertion order if python >= 3.6, else
+        # sorted by dict key (column label)
+        d = {'b': [2, 3], 'a': [0, 1]}
+        frame = SparseDataFrame(data=d)
+        if compat.PY36:
+            expected = SparseDataFrame(data=d, columns=list('ba'))
+        else:
+            expected = SparseDataFrame(data=d, columns=list('ab'))
+        tm.assert_sp_frame_equal(frame, expected)
+
     def test_constructor_ndarray(self):
         # no index or columns
         sp = SparseDataFrame(self.frame.values)

diff --git a/pandas/tests/sparse/series/test_series.py b/pandas/tests/sparse/series/test_series.py
@@ -14,7 +14,7 @@
 from pandas.tseries.offsets import BDay
 import pandas.util.testing as tm
 import pandas.util._test_decorators as td
-from pandas.compat import range
+from pandas.compat import range, PY36
 from pandas.core.reshape.util import cartesian_product
 
 import pandas.core.sparse.frame as spf
@@ -114,6 +114,18 @@ def test_constructor_dict_input(self):
         result = SparseSeries(constructor_dict)
         tm.assert_sp_series_equal(result, expected)
 
+    def test_constructor_dict_order(self):
+        # GH19018
+        # initialization ordering: by insertion order if python >= 3.6, else
+        # sorted by dict key (index label)
+        d = {'b': 1, 'a': 0, 'c': 2}
+        result = SparseSeries(d)
+        if PY36:
+            expected = SparseSeries([1, 0, 2], index=list('bac'))
+        else:
+            expected = SparseSeries([0, 1, 2], index=list('abc'))
+        tm.assert_sp_series_equal(result, expected)
+
     def test_constructor_dtype(self):
         arr = SparseSeries([np.nan, 1, 2, np.nan])
         assert arr.dtype == np.float64