ENH: add set_index to Series

pandas-dev · Oct 19, 2018 · 949e699 · 949e699
1 parent 145c227
commit 949e699
Show file tree

Hide file tree

Showing 5 changed files with 385 additions and 95 deletions.
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -194,6 +194,7 @@ Other Enhancements
   The default compression for ``to_csv``, ``to_json``, and ``to_pickle`` methods has been updated to ``'infer'`` (:issue:`22004`).
 - :func:`to_timedelta` now supports ISO-formatted timedelta strings (:issue:`21877`)
 - :class:`Series` and :class:`DataFrame` now support :class:`Iterable` in constructor (:issue:`2193`)
+- :class:`Series` has gained the method :meth:`Series.set_index`, which works like its :class:`DataFrame` counterpart :meth:`DataFrame.set_index` (:issue:`21684`)
 - :class:`DatetimeIndex` gained :attr:`DatetimeIndex.timetz` attribute. Returns local time with timezone information. (:issue:`21358`)
 - :meth:`round`, :meth:`ceil`, and :meth:`floor` for :class:`DatetimeIndex` and :class:`Timestamp` now support an ``ambiguous`` argument for handling datetimes that are rounded to ambiguous times (:issue:`18946`)
 - :class:`Resampler` now is iterable like :class:`GroupBy` (:issue:`15314`).

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -83,6 +83,7 @@
 from pandas.core.accessor import CachedAccessor
 from pandas.core.arrays import Categorical, ExtensionArray
 from pandas.core.config import get_option
+
 from pandas.core.generic import NDFrame, _shared_docs
 from pandas.core.index import (Index, MultiIndex, ensure_index,
                                ensure_index_from_sequences)
@@ -3923,45 +3924,56 @@ def shift(self, periods=1, freq=None, axis=0):
     def set_index(self, keys, drop=True, append=False, inplace=False,
                   verify_integrity=False):
         """
-        Set the DataFrame index (row labels) using one or more existing
-        columns. By default yields a new object.
+        Set the DataFrame index (row labels) using one or more columns.
 
         Parameters
         ----------
         keys : column label or list of column labels / arrays
+            Either a column label, Series, Index, MultiIndex, list,
+            np.ndarray or a list containing only column labels, Series, Index,
+            MultiIndex, list, np.ndarray.
         drop : boolean, default True
-            Delete columns to be used as the new index
+            Delete columns to be used as the new index.
         append : boolean, default False
-            Whether to append columns to existing index
+            Whether to append columns to existing index.
         inplace : boolean, default False
-            Modify the DataFrame in place (do not create a new object)
+            Modify the DataFrame in place (do not create a new object).
         verify_integrity : boolean, default False
             Check the new index for duplicates. Otherwise defer the check until
             necessary. Setting to False will improve the performance of this
-            method
+            method.
+
+        Returns
+        -------
+        reindexed : DataFrame if inplace is False, else None
+
+        See Also
+        --------
+        Series.set_index: Corresponding method for Series
 
         Examples
         --------
         >>> df = pd.DataFrame({'month': [1, 4, 7, 10],
         ...                    'year': [2012, 2014, 2013, 2014],
-        ...                    'sale':[55, 40, 84, 31]})
-           month  sale  year
-        0  1      55    2012
-        1  4      40    2014
-        2  7      84    2013
-        3  10     31    2014
+        ...                    'sale': [55, 40, 84, 31]})
+        >>> df
+           month  year  sale
+        0      1  2012    55
+        1      4  2014    40
+        2      7  2013    84
+        3     10  2014    31
 
         Set the index to become the 'month' column:
 
         >>> df.set_index('month')
-               sale  year
+               year  sale
         month
-        1      55    2012
-        4      40    2014
-        7      84    2013
-        10     31    2014
+        1      2012    55
+        4      2014    40
+        7      2013    84
+        10     2014    31
 
-        Create a multi-index using columns 'year' and 'month':
+        Create a MultiIndex using columns 'year' and 'month':
 
         >>> df.set_index(['year', 'month'])
                     sale
@@ -3971,7 +3983,7 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
         2013  7     84
         2014  10    31
 
-        Create a multi-index using a set of values and a column:
+        Create a MultiIndex using a set of values and a column:
 
         >>> df.set_index([[1, 2, 3, 4], 'year'])
                  month  sale
@@ -3980,12 +3992,7 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
         2  2014  4      40
         3  2013  7      84
         4  2014  10     31
-
-        Returns
-        -------
-        dataframe : DataFrame
         """
-        inplace = validate_bool_kwarg(inplace, 'inplace')
         if not isinstance(keys, list):
             keys = [keys]
 
@@ -4008,65 +4015,10 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
         if missing:
             raise KeyError('{}'.format(missing))
 
-        if inplace:
-            frame = self
-        else:
-            frame = self.copy()
-
-        arrays = []
-        names = []
-        if append:
-            names = [x for x in self.index.names]
-            if isinstance(self.index, ABCMultiIndex):
-                for i in range(self.index.nlevels):
-                    arrays.append(self.index._get_level_values(i))
-            else:
-                arrays.append(self.index)
-
-        to_remove = []
-        for col in keys:
-            if isinstance(col, ABCMultiIndex):
-                for n in range(col.nlevels):
-                    arrays.append(col._get_level_values(n))
-                names.extend(col.names)
-            elif isinstance(col, (ABCIndexClass, ABCSeries)):
-                # if Index then not MultiIndex (treated above)
-                arrays.append(col)
-                names.append(col.name)
-            elif isinstance(col, (list, np.ndarray)):
-                arrays.append(col)
-                names.append(None)
-            elif (is_list_like(col)
-                  and not (isinstance(col, tuple) and col in self)):
-                # all other list-likes (but avoid valid column keys)
-                col = list(col)  # ensure iterator do not get read twice etc.
-                arrays.append(col)
-                names.append(None)
-            # from here, col can only be a column label
-            else:
-                arrays.append(frame[col]._values)
-                names.append(col)
-                if drop:
-                    to_remove.append(col)
-
-        index = ensure_index_from_sequences(arrays, names)
-
-        if verify_integrity and not index.is_unique:
-            duplicates = index[index.duplicated()].unique()
-            raise ValueError('Index has duplicate keys: {dup}'.format(
-                dup=duplicates))
-
-        # use set to handle duplicate column names gracefully in case of drop
-        for c in set(to_remove):
-            del frame[c]
-
-        # clear up memory usage
-        index._cleanup()
-
-        frame.index = index
-
-        if not inplace:
-            return frame
+        vi = verify_integrity
+        return super(DataFrame, self).set_index(keys=keys, drop=drop,
+                                                append=append, inplace=inplace,
+                                                verify_integrity=vi)
 
     def reset_index(self, level=None, drop=False, inplace=False, col_level=0,
                     col_fill=''):

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -32,11 +32,13 @@
 from pandas.core.dtypes.cast import maybe_promote, maybe_upcast_putmask
 from pandas.core.dtypes.inference import is_hashable
 from pandas.core.dtypes.missing import isna, notna
-from pandas.core.dtypes.generic import ABCSeries, ABCPanel, ABCDataFrame
+from pandas.core.dtypes.generic import (ABCIndexClass, ABCMultiIndex, ABCPanel,
+                                        ABCSeries, ABCDataFrame)
 
 from pandas.core.base import PandasObject, SelectionMixin
-from pandas.core.index import (Index, MultiIndex, ensure_index,
-                               InvalidIndexError, RangeIndex)
+from pandas.core.index import (Index, MultiIndex,
+                               InvalidIndexError, RangeIndex,
+                               ensure_index, ensure_index_from_sequences)
 import pandas.core.indexing as indexing
 from pandas.core.indexes.datetimes import DatetimeIndex
 from pandas.core.indexes.period import PeriodIndex, Period
@@ -643,6 +645,142 @@ def _set_axis(self, axis, labels):
         self._data.set_axis(axis, labels)
         self._clear_item_cache()
 
+    def set_index(self, keys, drop=True, append=False, inplace=False,
+                  verify_integrity=False):
+        """
+        Set the index (row labels) using one or more given arrays (or labels).
+
+        Parameters
+        ----------
+        keys : column label or list of column labels / arrays
+            Either a Series, Index, MultiIndex, list, np.ndarray or a list
+            containing only Series, Index, MultiIndex, list, np.ndarray.
+
+            For DataFrame, additionally column labels may be used.
+        drop : boolean, default True
+            Delete columns to be used as the new index (only for DataFrame).
+        append : boolean, default False
+            Whether to append columns to existing index.
+        inplace : boolean, default False
+            Modify the Series/DataFrame in place (do not create a new object).
+        verify_integrity : boolean, default False
+            Check the new index for duplicates. Otherwise defer the check until
+            necessary. Setting to False will improve the performance of this
+            method.
+
+        Returns
+        -------
+        reindexed : Series/DataFrame if inplace is False, else None
+
+        See Also
+        --------
+        DataFrame.set_index: method adapted for DataFrame
+        Series.set_index: method adapted for Series
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({'month': [1, 4, 7, 10],
+        ...                    'year': [2012, 2014, 2013, 2014],
+        ...                    'sale': [55, 40, 84, 31]})
+        >>> df
+           month  year  sale
+        0      1  2012    55
+        1      4  2014    40
+        2      7  2013    84
+        3     10  2014    31
+
+        Set the index to become the 'month' column:
+
+        >>> df.set_index('month')
+               year  sale
+        month
+        1      2012    55
+        4      2014    40
+        7      2013    84
+        10     2014    31
+
+        Create a MultiIndex using columns 'year' and 'month':
+
+        >>> df.set_index(['year', 'month'])
+                    sale
+        year  month
+        2012  1     55
+        2014  4     40
+        2013  7     84
+        2014  10    31
+
+        Create a MultiIndex using a set of values and a column:
+
+        >>> df.set_index([[1, 2, 3, 4], 'year'])
+                 month  sale
+           year
+        1  2012  1      55
+        2  2014  4      40
+        3  2013  7      84
+        4  2014  10     31
+        """
+        # parameter keys is checked in Series.set_index / DataFrame.set_index!
+        inplace = validate_bool_kwarg(inplace, 'inplace')
+        if inplace:
+            obj = self
+        else:
+            obj = self.copy()
+
+        arrays = []
+        names = []
+        if append:
+            names = [x for x in self.index.names]
+            if isinstance(self.index, ABCMultiIndex):
+                for i in range(self.index.nlevels):
+                    arrays.append(self.index._get_level_values(i))
+            else:
+                arrays.append(self.index)
+
+        to_remove = []
+        for col in keys:
+            if isinstance(col, ABCMultiIndex):
+                for n in range(col.nlevels):
+                    arrays.append(col._get_level_values(n))
+                names.extend(col.names)
+            elif isinstance(col, (ABCIndexClass, ABCSeries)):
+                # if Index then not MultiIndex (treated above)
+                arrays.append(col)
+                names.append(col.name)
+            elif isinstance(col, (list, np.ndarray)):
+                arrays.append(col)
+                names.append(None)
+            elif (is_list_like(col)
+                  and not (isinstance(col, tuple) and col in self)):
+                # all other list-likes (but avoid valid column keys)
+                col = list(col)  # ensure iterator do not get read twice etc.
+                arrays.append(col)
+                names.append(None)
+            # from here, col can only be a column label
+            else:
+                arrays.append(obj[col]._values)
+                names.append(col)
+                if drop:
+                    to_remove.append(col)
+
+        index = ensure_index_from_sequences(arrays, names)
+
+        if verify_integrity and not index.is_unique:
+            duplicates = list(index[index.duplicated()])
+            raise ValueError('Index has duplicate keys: {dup}'.format(
+                dup=duplicates))
+
+        # use set to handle duplicate column names gracefully in case of drop
+        for c in set(to_remove):
+            del obj[c]
+
+        # clear up memory usage
+        index._cleanup()
+
+        obj.index = index
+
+        if not inplace:
+            return obj
+
     def transpose(self, *args, **kwargs):
         """
         Permute the dimensions of the %(klass)s