diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 3be87c4cabaf0..5f6a0cbea141b 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -391,6 +391,7 @@ Other Enhancements - :meth:`DataFrame.to_sql` now supports writing ``TIMESTAMP WITH TIME ZONE`` types for supported databases. For databases that don't support timezones, datetime data will be stored as timezone unaware local timestamps. See the :ref:`io.sql_datetime_data` for implications (:issue:`9086`). - :func:`to_timedelta` now supports iso-formated timedelta strings (:issue:`21877`) - :class:`Series` and :class:`DataFrame` now support :class:`Iterable` in constructor (:issue:`2193`) +- :class:`Series` has gained the method :meth:`Series.set_index`, which works like its :class:`DataFrame` counterpart :meth:`DataFrame.set_index` (:issue:`21684`) - :class:`DatetimeIndex` has gained the :attr:`DatetimeIndex.timetz` attribute. This returns the local time with timezone information. (:issue:`21358`) - :meth:`Timestamp.round`, :meth:`Timestamp.ceil`, and :meth:`Timestamp.floor` for :class:`DatetimeIndex` and :class:`Timestamp` now support an ``ambiguous`` argument for handling datetimes that are rounded to ambiguous times (:issue:`18946`) - :meth:`Timestamp.round`, :meth:`Timestamp.ceil`, and :meth:`Timestamp.floor` for :class:`DatetimeIndex` and :class:`Timestamp` now support a ``nonexistent`` argument for handling datetimes that are rounded to nonexistent times. 
See :ref:`timeseries.timezone_nonexistent` (:issue:`22647`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a50def7357826..26f3c1c3948d5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -69,7 +69,7 @@ is_iterator, is_sequence, is_named_tuple) -from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass, ABCMultiIndex +from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass from pandas.core.dtypes.missing import isna, notna from pandas.core import algorithms @@ -4035,83 +4035,10 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): return super(DataFrame, self).shift(periods=periods, freq=freq, axis=axis, fill_value=fill_value) + @Appender(NDFrame.set_index.__doc__) def set_index(self, keys, drop=True, append=False, inplace=False, verify_integrity=False): - """ - Set the DataFrame index using existing columns. - - Set the DataFrame index (row labels) using one or more existing - columns. The index can replace the existing index or expand on it. - - Parameters - ---------- - keys : label or list of label - Name or names of the columns that will be used as the index. - drop : bool, default True - Delete columns to be used as the new index. - append : bool, default False - Whether to append columns to existing index. - inplace : bool, default False - Modify the DataFrame in place (do not create a new object). - verify_integrity : bool, default False - Check the new index for duplicates. Otherwise defer the check until - necessary. Setting to False will improve the performance of this - method. - Returns - ------- - DataFrame - Changed row labels. - - See Also - -------- - DataFrame.reset_index : Opposite of set_index. - DataFrame.reindex : Change to new indices or expand indices. - DataFrame.reindex_like : Change to same indices as other DataFrame. - - Examples - -------- - >>> df = pd.DataFrame({'month': [1, 4, 7, 10], - ... 'year': [2012, 2014, 2013, 2014], - ... 
'sale': [55, 40, 84, 31]}) - >>> df - month year sale - 0 1 2012 55 - 1 4 2014 40 - 2 7 2013 84 - 3 10 2014 31 - - Set the index to become the 'month' column: - - >>> df.set_index('month') - year sale - month - 1 2012 55 - 4 2014 40 - 7 2013 84 - 10 2014 31 - - Create a multi-index using columns 'year' and 'month': - - >>> df.set_index(['year', 'month']) - sale - year month - 2012 1 55 - 2014 4 40 - 2013 7 84 - 2014 10 31 - - Create a multi-index using a set of values and a column: - - >>> df.set_index([[1, 2, 3, 4], 'year']) - month sale - year - 1 2012 1 55 - 2 2014 4 40 - 3 2013 7 84 - 4 2014 10 31 - """ - inplace = validate_bool_kwarg(inplace, 'inplace') if not isinstance(keys, list): keys = [keys] @@ -4134,65 +4061,9 @@ def set_index(self, keys, drop=True, append=False, inplace=False, if missing: raise KeyError('{}'.format(missing)) - if inplace: - frame = self - else: - frame = self.copy() - - arrays = [] - names = [] - if append: - names = [x for x in self.index.names] - if isinstance(self.index, ABCMultiIndex): - for i in range(self.index.nlevels): - arrays.append(self.index._get_level_values(i)) - else: - arrays.append(self.index) - - to_remove = [] - for col in keys: - if isinstance(col, ABCMultiIndex): - for n in range(col.nlevels): - arrays.append(col._get_level_values(n)) - names.extend(col.names) - elif isinstance(col, (ABCIndexClass, ABCSeries)): - # if Index then not MultiIndex (treated above) - arrays.append(col) - names.append(col.name) - elif isinstance(col, (list, np.ndarray)): - arrays.append(col) - names.append(None) - elif (is_list_like(col) - and not (isinstance(col, tuple) and col in self)): - # all other list-likes (but avoid valid column keys) - col = list(col) # ensure iterator do not get read twice etc. 
- arrays.append(col) - names.append(None) - # from here, col can only be a column label - else: - arrays.append(frame[col]._values) - names.append(col) - if drop: - to_remove.append(col) - - index = ensure_index_from_sequences(arrays, names) - - if verify_integrity and not index.is_unique: - duplicates = index[index.duplicated()].unique() - raise ValueError('Index has duplicate keys: {dup}'.format( - dup=duplicates)) - - # use set to handle duplicate column names gracefully in case of drop - for c in set(to_remove): - del frame[c] - - # clear up memory usage - index._cleanup() - - frame.index = index - - if not inplace: - return frame + return super(DataFrame, self).set_index( + keys=keys, drop=drop, append=append, inplace=inplace, + verify_integrity=verify_integrity) def reset_index(self, level=None, drop=False, inplace=False, col_level=0, col_fill=''): diff --git a/pandas/core/generic.py b/pandas/core/generic.py index d0555bd2e44b1..00193249fe828 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -29,7 +29,8 @@ is_extension_array_dtype, is_integer, is_list_like, is_number, is_numeric_dtype, is_object_dtype, is_period_arraylike, is_re_compilable, is_scalar, is_timedelta64_dtype, pandas_dtype) -from pandas.core.dtypes.generic import ABCDataFrame, ABCPanel, ABCSeries +from pandas.core.dtypes.generic import ( + ABCDataFrame, ABCIndexClass, ABCMultiIndex, ABCPanel, ABCSeries) from pandas.core.dtypes.inference import is_hashable from pandas.core.dtypes.missing import isna, notna @@ -39,7 +40,8 @@ from pandas.core.base import PandasObject, SelectionMixin import pandas.core.common as com from pandas.core.index import ( - Index, InvalidIndexError, MultiIndex, RangeIndex, ensure_index) + Index, InvalidIndexError, MultiIndex, RangeIndex, ensure_index, + ensure_index_from_sequences) from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.period import Period, PeriodIndex import pandas.core.indexing as indexing @@ -629,6 +631,155 @@ 
def _set_axis(self, axis, labels): self._data.set_axis(axis, labels) self._clear_item_cache() + _shared_docs['set_index'] = """ + Set the index (row labels) using one or more given arrays (or labels). + + Parameters + ---------- + %(params)s + append : bool, default False + Whether to append columns to existing index. + inplace : bool, default False + Modify the %(klass)s in place (do not create a new object). + verify_integrity : bool, default False + Check the new index for duplicates. Otherwise defer the check until + necessary. Setting to False will improve the performance of this + method. + + Returns + ------- + %(klass)s + The reindexed %(klass)s. Will be None if inplace is True. + + See Also + -------- + %(other_klass)s.set_index : Method adapted for %(other_klass)s. + %(klass)s.reset_index : Opposite of set_index. + %(klass)s.reindex : Change to new indices or expand indices. + %(klass)s.reindex_like : Change to same indices as other %(klass)s. + + Examples + -------- + %(examples)s + """ + + @Substitution( + klass='DataFrame', other_klass='Series', + params=dedent("""\ + keys : column label or list of column labels / arrays + Either a column label, Series, Index, MultiIndex, list, np.ndarray + or a list containing only column labels, Series, Index, MultiIndex, + list, np.ndarray. + drop : bool, default True + Delete columns to be used as the new index."""), + examples=dedent("""\ + >>> df = pd.DataFrame({'month': [1, 4, 7, 10], + ... 'year': [2012, 2014, 2013, 2014], + ... 
'sale': [55, 40, 84, 31]}) + >>> df + month year sale + 0 1 2012 55 + 1 4 2014 40 + 2 7 2013 84 + 3 10 2014 31 + + Set the index to become the 'month' column: + + >>> df.set_index('month') + year sale + month + 1 2012 55 + 4 2014 40 + 7 2013 84 + 10 2014 31 + + Create a MultiIndex using columns 'year' and 'month': + + >>> df.set_index(['year', 'month']) + sale + year month + 2012 1 55 + 2014 4 40 + 2013 7 84 + 2014 10 31 + + Create a MultiIndex using a set of values and a column: + + >>> df.set_index([[1, 2, 3, 4], 'year']) + month sale + year + 1 2012 1 55 + 2 2014 4 40 + 3 2013 7 84 + 4 2014 10 31""") + ) + @Appender(_shared_docs["set_index"]) + def set_index(self, keys, drop=True, append=False, inplace=False, + verify_integrity=False): + # parameter keys is checked in Series.set_index / DataFrame.set_index, + # will always be passed as a list of list-likes! + + inplace = validate_bool_kwarg(inplace, 'inplace') + if inplace: + obj = self + else: + obj = self.copy() + + arrays = [] + names = [] + if append: + names = [x for x in self.index.names] + if isinstance(self.index, ABCMultiIndex): + for i in range(self.index.nlevels): + arrays.append(self.index._get_level_values(i)) + else: + arrays.append(self.index) + + to_remove = [] + for col in keys: + if isinstance(col, ABCMultiIndex): + for n in range(col.nlevels): + arrays.append(col._get_level_values(n)) + names.extend(col.names) + elif isinstance(col, (ABCIndexClass, ABCSeries)): + # if Index then not MultiIndex (treated above) + arrays.append(col) + names.append(col.name) + elif isinstance(col, (list, np.ndarray)): + arrays.append(col) + names.append(None) + elif (is_list_like(col) + and not (isinstance(col, tuple) and col in self)): + # all other list-likes (but avoid valid column keys) + col = list(col) # ensure iterator do not get read twice etc. 
+ arrays.append(col) + names.append(None) + # from here, col can only be a column label + else: + arrays.append(obj[col]._values) + names.append(col) + if drop: + to_remove.append(col) + + index = ensure_index_from_sequences(arrays, names) + + if verify_integrity and not index.is_unique: + duplicates = list(index[index.duplicated()]) + raise ValueError('Index has duplicate keys: {dup}'.format( + dup=duplicates)) + + # use set to handle duplicate column names gracefully in case of drop + for c in set(to_remove): + del obj[c] + + # clear up memory usage + index._cleanup() + + obj.index = index + + if not inplace: + return obj + def transpose(self, *args, **kwargs): """ Permute the dimensions of the %(klass)s diff --git a/pandas/core/series.py b/pandas/core/series.py index de34227cda28a..e4df55c9afcb4 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1171,6 +1171,71 @@ def _set_value(self, label, value, takeable=False): return self _set_value.__doc__ = set_value.__doc__ + @Substitution( + klass='Series', other_klass='DataFrame', + params=dedent("""\ + arrays : array or list of arrays + Either a Series, Index, MultiIndex, list, np.ndarray or a list + containing only Series, Index, MultiIndex, list, np.ndarray."""), + examples=dedent("""\ + >>> s = pd.Series(range(10, 13)) + >>> s + 0 10 + 1 11 + 2 12 + dtype: int64 + + Set the index to become `['a', 'b', 'c']`: + + >>> s.set_index(['a', 'b', 'c']) + a 10 + b 11 + c 12 + dtype: int64 + + Create a MultiIndex by appending to the existing index: + + >>> s.set_index(['a', 'b', 'c'], append=True) + 0 a 10 + 1 b 11 + 2 c 12 + dtype: int64 + + Create a MultiIndex by passing a list of arrays: + + >>> t = (s ** 2).set_index([['a', 'b', 'c'], ['I', 'II', 'III']]) + >>> t + a I 100 + b II 121 + c III 144 + dtype: int64 + + Apply index from another object (of the same length!): + + >>> s.set_index(t.index) + a I 10 + b II 11 + c III 12 + dtype: int64""") + ) + @Appender(generic._shared_docs['set_index']) + def 
set_index(self, arrays, append=False, inplace=False, + verify_integrity=False): + + # NDFrame.set_index expects a list of list-likes. Lists of scalars + # must be wrapped in another list to avoid being interpreted as keys. + if not isinstance(arrays, list) or all(is_scalar(x) for x in arrays): + arrays = [arrays] + + if any(not is_list_like(x, allow_sets=False) + or getattr(x, 'ndim', 1) > 1 for x in arrays): + raise TypeError('The parameter "arrays" may only contain ' + 'one-dimensional list-likes') + + return super(Series, self).set_index(keys=arrays, drop=False, + append=append, inplace=inplace, + verify_integrity=verify_integrity) + def reset_index(self, level=None, drop=False, name=None, inplace=False): """ Generate a new DataFrame or Series with the index reset. diff --git a/pandas/tests/series/test_alter_axes.py b/pandas/tests/series/test_alter_axes.py index 99a4f0c424ce9..d77f833965233 100644 --- a/pandas/tests/series/test_alter_axes.py +++ b/pandas/tests/series/test_alter_axes.py @@ -14,17 +14,139 @@ class TestSeriesAlterAxes(object): - def test_setindex(self, string_series): - # wrong type - pytest.raises(TypeError, setattr, string_series, 'index', None) + def test_set_index_directly(self, string_series): + idx = Index(np.arange(len(string_series))[::-1]) + + string_series.index = idx + tm.assert_index_equal(string_series.index, idx) + with pytest.raises(ValueError, match='Length mismatch'): + string_series.index = idx[::2] + + # MultiIndex constructor does not work directly on Series -> lambda + @pytest.mark.parametrize('box', [Series, Index, np.array, + list, tuple, iter, + lambda x: MultiIndex.from_arrays([x])]) + @pytest.mark.parametrize('inplace', [True, False]) + def test_set_index(self, string_series, inplace, box): + idx = box(string_series.index[::-1]) + + expected = Index(string_series.index[::-1]) + + if inplace: + result = string_series.copy() + result.set_index(idx, inplace=True) + else: + result = string_series.set_index(idx) + + 
tm.assert_index_equal(result.index, expected) + with pytest.raises(ValueError, match='Length mismatch'): + string_series.set_index(string_series.index[::2], inplace=inplace) + + def test_set_index_cast(self): + # issue casting an index then set_index + s = Series([1.1, 2.2, 3.3], index=[2010, 2011, 2012]) + s2 = s.set_index(s.index.astype(np.int32)) + tm.assert_series_equal(s, s2) + + # MultiIndex constructor does not work directly on Series -> lambda + # also test index name if append=True (name is duplicate here for 'B') + @pytest.mark.parametrize('box', [Series, Index, np.array, + list, tuple, iter, + lambda x: MultiIndex.from_arrays([x])]) + @pytest.mark.parametrize('index_name', [None, 'B', 'test']) + def test_set_index_append(self, string_series, index_name, box): + string_series.index.name = index_name + + arrays = box(string_series.index[::-1]) + # np.array/list/tuple/iter "forget" the name of series.index + names = [index_name, + None if box in [np.array, list, tuple, iter] else index_name] + + idx = MultiIndex.from_arrays([string_series.index, + string_series.index[::-1]], + names=names) + expected = string_series.copy() + expected.index = idx + + result = string_series.set_index(arrays, append=True) + + tm.assert_series_equal(result, expected) + + def test_set_index_append_to_multiindex(self, string_series): + s = string_series.set_index(string_series.index[::-1], append=True) + + idx = np.random.randn(len(s)) + expected = string_series.set_index([string_series.index[::-1], idx], + append=True) + + result = s.set_index(idx, append=True) - # wrong length - pytest.raises(Exception, setattr, string_series, 'index', - np.arange(len(string_series) - 1)) + tm.assert_series_equal(result, expected) + + # MultiIndex constructor does not work directly on Series -> lambda + # also test index name if append=True (name is duplicate here for 'B') + @pytest.mark.parametrize('box', [Series, Index, np.array, + list, tuple, iter, + lambda x: MultiIndex.from_arrays([x])]) 
+ @pytest.mark.parametrize('append, index_name', [(True, None), (True, 'B'), + (True, 'test'), (False, None)]) + def test_set_index_pass_arrays(self, string_series, append, + index_name, box): + string_series.index.name = index_name + + idx = string_series.index[::-1] + idx.name = 'B' + arrays = [box(idx), np.random.randn(len(string_series))] + + result = string_series.set_index(arrays, append=append) + + if box == iter: + # content was consumed -> re-read + arrays[0] = box(idx) + + # to test against already-tested behavior, we add sequentially, + # hence second append always True; must wrap keys in list, otherwise + # box = list would be illegal + expected = string_series.set_index([arrays[0]], append=append) + expected = expected.set_index([arrays[1]], append=True) + + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('append', [True, False]) + def test_set_index_pass_multiindex(self, string_series, append): + arrays = MultiIndex.from_arrays([string_series.values, + string_series.index[::-1]], + names=['A', 'B']) + + result = string_series.set_index(arrays, append=append) + + expected = string_series.set_index([string_series.values, + string_series.index[::-1]], + append=append) + expected.index.names = [None, 'A', 'B'] if append else ['A', 'B'] + + tm.assert_series_equal(result, expected) - # works - string_series.index = np.arange(len(string_series)) - assert isinstance(string_series.index, Index) + def test_set_index_verify_integrity(self, string_series): + idx = np.zeros(len(string_series)) + + with pytest.raises(ValueError, match='Index has duplicate keys'): + string_series.set_index(idx, verify_integrity=True) + # with MultiIndex + with pytest.raises(ValueError, match='Index has duplicate keys'): + string_series.set_index([idx, idx], verify_integrity=True) + + def test_set_index_raise(self, string_series): + msg = 'The parameter "arrays" may only contain one-dimensional.*' + # forbidden type, e.g. 
set + with pytest.raises(TypeError, match=msg): + string_series.set_index(set(string_series.index), + verify_integrity=True) + + # wrong type in list with arrays + with pytest.raises(TypeError, match=msg): + string_series.set_index([string_series.index, 'X'], + verify_integrity=True) # Renaming