pandas-dev · h-vetinari · Jan 10, 2019 · Jan 10, 2019 · Jan 10, 2019 · Jan 10, 2019
diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
@@ -1251,8 +1251,8 @@ Other API Changes
 - :class:`pandas.io.formats.style.Styler` supports a ``number-format`` property when using :meth:`~pandas.io.formats.style.Styler.to_excel` (:issue:`22015`)
 - :meth:`DataFrame.corr` and :meth:`Series.corr` now raise a ``ValueError`` along with a helpful error message instead of a ``KeyError`` when supplied with an invalid method (:issue:`22298`)
 - :meth:`shift` will now always return a copy, instead of the previous behaviour of returning self when shifting by 0 (:issue:`22397`)
-- :meth:`DataFrame.set_index` now allows all one-dimensional list-likes, raises a ``TypeError`` for incorrect types,
-  has an improved ``KeyError`` message, and will not fail on duplicate column names with ``drop=True``. (:issue:`22484`)
+- :meth:`DataFrame.set_index` now gives a better (and less frequent) KeyError, raises a ``ValueError`` for incorrect types,
+  and will not fail on duplicate column names with ``drop=True``. (:issue:`22484`)
 - Slicing a single row of a DataFrame with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`)
 - :class:`DateOffset` attribute `_cacheable` and method `_should_cache` have been removed (:issue:`23118`)
 - :meth:`Series.searchsorted`, when supplied a scalar value to search for, now returns a scalar instead of an array (:issue:`23801`).
@@ -1309,6 +1309,7 @@ Deprecations
 - In :meth:`Series.where` with Categorical data, providing an ``other`` that is not present in the categories is deprecated. Convert the categorical to a different dtype or add the ``other`` to the categories first (:issue:`24077`).
 - :meth:`Series.clip_lower`, :meth:`Series.clip_upper`, :meth:`DataFrame.clip_lower` and :meth:`DataFrame.clip_upper` are deprecated and will be removed in a future version. Use ``Series.clip(lower=threshold)``, ``Series.clip(upper=threshold)`` and the equivalent ``DataFrame`` methods (:issue:`24203`)
 - :meth:`Series.nonzero` is deprecated and will be removed in a future version (:issue:`18262`)
+- :meth:`DataFrame.set_index` has deprecated using lists of values *within* lists. It remains possible to pass array-likes, both directly and within a list.
 - Passing an integer to :meth:`Series.fillna` and :meth:`DataFrame.fillna` with ``timedelta64[ns]`` dtypes is deprecated, will raise ``TypeError`` in a future version.  Use ``obj.fillna(pd.Timedelta(...))` instead (:issue:`24694`)
 
 .. _whatsnew_0240.deprecations.datetimelike_int_ops:

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -4041,12 +4041,18 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
         Set the DataFrame index using existing columns.
 
         Set the DataFrame index (row labels) using one or more existing
-        columns. The index can replace the existing index or expand on it.
+        columns or arrays (of the correct length). The index can replace the
+        existing index or expand on it.
 
         Parameters
         ----------
-        keys : label or list of label
-            Name or names of the columns that will be used as the index.
+        keys : label or array-like or list of labels/arrays
+            This parameter can be either a single column key, a single array of
+            the same length as the calling DataFrame, or a list containing an
+            arbitrary combination of column keys and arrays. Here, "array"
+            encompasses :class:`Series`, :class:`Index` and ``np.ndarray``.
+            Lists (in the sense of a sequence of values, not column labels)
+            have been deprecated, and will be removed in a future version.
         drop : bool, default True
             Delete columns to be used as the new index.
         append : bool, default False
@@ -4091,7 +4097,7 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
         7      2013    84
         10     2014    31
 
-        Create a multi-index using columns 'year' and 'month':
+        Create a MultiIndex using columns 'year' and 'month':
 
         >>> df.set_index(['year', 'month'])
                     sale
@@ -4101,38 +4107,62 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
         2013  7     84
         2014  10    31
 
-        Create a multi-index using a set of values and a column:
+        Create a MultiIndex using an Index and a column:
 
-        >>> df.set_index([[1, 2, 3, 4], 'year'])
+        >>> df.set_index([pd.Index([1, 2, 3, 4]), 'year'])
                  month  sale
            year
         1  2012  1      55
         2  2014  4      40
         3  2013  7      84
         4  2014  10     31
+
+        Create a MultiIndex using two Series:
+
+        >>> s = pd.Series([1, 2, 3, 4])
+        >>> df.set_index([s, s**2])
+              month  year  sale
+        1 1       1  2012    55
+        2 4       4  2014    40
+        3 9       7  2013    84
+        4 16     10  2014    31
         """
         inplace = validate_bool_kwarg(inplace, 'inplace')
-        if not isinstance(keys, list):
+
+        err_msg = ('The parameter "keys" may be a column key, one-dimensional '
+                   'array, or a list containing only valid column keys and '
+                   'one-dimensional arrays.')
+
+        if (is_scalar(keys) or isinstance(keys, tuple)
+                or isinstance(keys, (ABCIndexClass, ABCSeries, np.ndarray))):
+            # make sure we have a container of keys/arrays we can iterate over
+            # tuples can appear as valid column keys!
             keys = [keys]
+        elif not isinstance(keys, list):
+            raise ValueError(err_msg)
 
         missing = []
+        depr_warn = False
         for col in keys:
             if (is_scalar(col) or isinstance(col, tuple)) and col in self:
-                # tuples can be both column keys or list-likes
-                # if they are valid column keys, everything is fine
+                # if col is a valid column key, everything is fine
                 continue
             elif is_scalar(col) and col not in self:
-                # tuples that are not column keys are considered list-like,
-                # not considered missing
+                # tuples that are not keys are not considered missing,
+                # but illegal (see below)
                 missing.append(col)
-            elif (not is_list_like(col, allow_sets=False)
+            elif isinstance(col, list):
+                depr_warn = True
+            elif (not isinstance(col, (ABCIndexClass, ABCSeries, np.ndarray))
                   or getattr(col, 'ndim', 1) > 1):
-                raise TypeError('The parameter "keys" may only contain a '
-                                'combination of valid column keys and '
-                                'one-dimensional list-likes')
+                raise ValueError(err_msg)
 
         if missing:
             raise KeyError('{}'.format(missing))
+        if depr_warn:
+            msg = ('passing lists within a list to the parameter "keys" is '
+                   'deprecated and will be removed in a future version.')
+            warnings.warn(msg, FutureWarning, stacklevel=2)
 
         if inplace:
             frame = self
@@ -4162,12 +4192,6 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
             elif isinstance(col, (list, np.ndarray)):
                 arrays.append(col)
                 names.append(None)
-            elif (is_list_like(col)
-                  and not (isinstance(col, tuple) and col in self)):
-                # all other list-likes (but avoid valid column keys)
-                col = list(col)  # ensure iterator do not get read twice etc.
-                arrays.append(col)
-                names.append(None)
             # from here, col can only be a column label
             else:
                 arrays.append(frame[col]._values)

diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py
@@ -117,8 +117,7 @@ def test_set_index_after_mutation(self):
     # MultiIndex constructor does not work directly on Series -> lambda
     # Add list-of-list constructor because list is ambiguous -> lambda
     # also test index name if append=True (name is duplicate here for B)
-    @pytest.mark.parametrize('box', [Series, Index, np.array,
-                                     list, tuple, iter, lambda x: [list(x)],
+    @pytest.mark.parametrize('box', [Series, Index, np.array, list,
                                      lambda x: MultiIndex.from_arrays([x])])
     @pytest.mark.parametrize('append, index_name', [(True, None),
                              (True, 'B'), (True, 'test'), (False, None)])
@@ -135,7 +134,7 @@ def test_set_index_pass_single_array(self, frame_of_index_cols,
             with pytest.raises(KeyError, match=msg):
                 df.set_index(key, drop=drop, append=append)
         else:
-            # np.array/tuple/iter/list-of-list "forget" the name of B
+            # np.array "forgets" the name of B
             name_mi = getattr(key, 'names', None)
             name = [getattr(key, 'name', None)] if name_mi is None else name_mi
 
@@ -150,8 +149,7 @@ def test_set_index_pass_single_array(self, frame_of_index_cols,
 
     # MultiIndex constructor does not work directly on Series -> lambda
     # also test index name if append=True (name is duplicate here for A & B)
-    @pytest.mark.parametrize('box', [Series, Index, np.array,
-                                     list, tuple, iter,
+    @pytest.mark.parametrize('box', [Series, Index, np.array, list,
                                      lambda x: MultiIndex.from_arrays([x])])
     @pytest.mark.parametrize('append, index_name',
                              [(True, None), (True, 'A'), (True, 'B'),
@@ -163,10 +161,14 @@ def test_set_index_pass_arrays(self, frame_of_index_cols,
         df.index.name = index_name
 
         keys = ['A', box(df['B'])]
-        # np.array/list/tuple/iter "forget" the name of B
-        names = ['A', None if box in [np.array, list, tuple, iter] else 'B']
+        # np.array/list "forget" the name of B
+        names = ['A', None if box in [np.array, list] else 'B']
 
-        result = df.set_index(keys, drop=drop, append=append)
+        if box == list:
+            with tm.assert_produces_warning(FutureWarning):
+                result = df.set_index(keys, drop=drop, append=append)
+        else:
+            result = df.set_index(keys, drop=drop, append=append)
 
         # only valid column keys are dropped
         # since B is always passed as array above, only A is dropped, if at all
@@ -179,12 +181,10 @@ def test_set_index_pass_arrays(self, frame_of_index_cols,
     # MultiIndex constructor does not work directly on Series -> lambda
     # We also emulate a "constructor" for the label -> lambda
     # also test index name if append=True (name is duplicate here for A)
-    @pytest.mark.parametrize('box2', [Series, Index, np.array,
-                                      list, tuple, iter,
+    @pytest.mark.parametrize('box2', [Series, Index, np.array, list,
                                       lambda x: MultiIndex.from_arrays([x]),
                                       lambda x: x.name])
-    @pytest.mark.parametrize('box1', [Series, Index, np.array,
-                                      list, tuple, iter,
+    @pytest.mark.parametrize('box1', [Series, Index, np.array, list,
                                       lambda x: MultiIndex.from_arrays([x]),
                                       lambda x: x.name])
     @pytest.mark.parametrize('append, index_name', [(True, None),
@@ -196,10 +196,12 @@ def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop,
         df.index.name = index_name
 
         keys = [box1(df['A']), box2(df['A'])]
-        result = df.set_index(keys, drop=drop, append=append)
 
-        # if either box was iter, the content has been consumed; re-read it
-        keys = [box1(df['A']), box2(df['A'])]
+        if box1 == list or box2 == list:
+            with tm.assert_produces_warning(FutureWarning):
+                result = df.set_index(keys, drop=drop, append=append)
+        else:
+            result = df.set_index(keys, drop=drop, append=append)
 
         # need to adapt first drop for case that both keys are 'A' --
         # cannot drop the same column twice;
@@ -208,9 +210,17 @@ def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop,
 
         # to test against already-tested behaviour, we add sequentially,
         # hence second append always True; must wrap keys in list, otherwise
-        # box = list would be illegal
-        expected = df.set_index([keys[0]], drop=first_drop, append=append)
-        expected = expected.set_index([keys[1]], drop=drop, append=True)
+        # box = list would be interpreted as keys
+        if box1 == list or box2 == list:
+            with tm.assert_produces_warning(FutureWarning):
+                expected = df.set_index([keys[0]], drop=first_drop,
+                                        append=append)
+                expected = expected.set_index([keys[1]], drop=drop,
+                                              append=True)
+        else:
+            expected = df.set_index([keys[0]], drop=first_drop, append=append)
+            expected = expected.set_index([keys[1]], drop=drop, append=True)
+
         tm.assert_frame_equal(result, expected)
 
     @pytest.mark.parametrize('append', [True, False])
@@ -249,13 +259,13 @@ def test_set_index_raise(self, frame_of_index_cols, drop, append):
         with pytest.raises(KeyError, match='X'):
             df.set_index([df['A'], df['B'], 'X'], drop=drop, append=append)
 
-        msg = 'The parameter "keys" may only contain a combination of.*'
+        msg = 'The parameter "keys" may be a column key, .*'
         # forbidden type, e.g. set
-        with pytest.raises(TypeError, match=msg):
+        with pytest.raises(ValueError, match=msg):
             df.set_index(set(df['A']), drop=drop, append=append)
 
         # forbidden type in list, e.g. set
-        with pytest.raises(TypeError, match=msg):
+        with pytest.raises(ValueError, match=msg):
             df.set_index(['A', df['A'], set(df['A'])],
                          drop=drop, append=append)