Skip to content

Commit

Permalink
ENH: add set_index to Series
Browse files Browse the repository at this point in the history
  • Loading branch information
h-vetinari committed Oct 19, 2018
1 parent 145c227 commit 949e699
Show file tree
Hide file tree
Showing 5 changed files with 385 additions and 95 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,7 @@ Other Enhancements
The default compression for ``to_csv``, ``to_json``, and ``to_pickle`` methods has been updated to ``'infer'`` (:issue:`22004`).
- :func:`to_timedelta` now supports ISO-formatted timedelta strings (:issue:`21877`)
- :class:`Series` and :class:`DataFrame` now support :class:`Iterable` in constructor (:issue:`2193`)
- :class:`Series` has gained the method :meth:`Series.set_index`, which works like its :class:`DataFrame` counterpart :meth:`DataFrame.set_index` (:issue:`21684`)
- :class:`DatetimeIndex` gained :attr:`DatetimeIndex.timetz` attribute. Returns local time with timezone information. (:issue:`21358`)
- :meth:`round`, :meth:`ceil`, and :meth:`floor` for :class:`DatetimeIndex` and :class:`Timestamp` now support an ``ambiguous`` argument for handling datetimes that are rounded to ambiguous times (:issue:`18946`)
- :class:`Resampler` now is iterable like :class:`GroupBy` (:issue:`15314`).
Expand Down
118 changes: 35 additions & 83 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@
from pandas.core.accessor import CachedAccessor
from pandas.core.arrays import Categorical, ExtensionArray
from pandas.core.config import get_option

from pandas.core.generic import NDFrame, _shared_docs
from pandas.core.index import (Index, MultiIndex, ensure_index,
ensure_index_from_sequences)
Expand Down Expand Up @@ -3923,45 +3924,56 @@ def shift(self, periods=1, freq=None, axis=0):
def set_index(self, keys, drop=True, append=False, inplace=False,
verify_integrity=False):
"""
Set the DataFrame index (row labels) using one or more existing
columns. By default yields a new object.
Set the DataFrame index (row labels) using one or more columns.
Parameters
----------
keys : column label or list of column labels / arrays
Either a column label, Series, Index, MultiIndex, list,
np.ndarray or a list containing only column labels, Series, Index,
MultiIndex, list, np.ndarray.
drop : boolean, default True
Delete columns to be used as the new index
Delete columns to be used as the new index.
append : boolean, default False
Whether to append columns to existing index
Whether to append columns to existing index.
inplace : boolean, default False
Modify the DataFrame in place (do not create a new object)
Modify the DataFrame in place (do not create a new object).
verify_integrity : boolean, default False
Check the new index for duplicates. Otherwise defer the check until
necessary. Setting to False will improve the performance of this
method
method.
Returns
-------
reindexed : DataFrame if inplace is False, else None
See Also
--------
Series.set_index: Corresponding method for Series
Examples
--------
>>> df = pd.DataFrame({'month': [1, 4, 7, 10],
... 'year': [2012, 2014, 2013, 2014],
... 'sale':[55, 40, 84, 31]})
month sale year
0 1 55 2012
1 4 40 2014
2 7 84 2013
3 10 31 2014
... 'sale': [55, 40, 84, 31]})
>>> df
month year sale
0 1 2012 55
1 4 2014 40
2 7 2013 84
3 10 2014 31
Set the index to become the 'month' column:
>>> df.set_index('month')
sale year
year sale
month
1 55 2012
4 40 2014
7 84 2013
10 31 2014
1 2012 55
4 2014 40
7 2013 84
10 2014 31
Create a multi-index using columns 'year' and 'month':
Create a MultiIndex using columns 'year' and 'month':
>>> df.set_index(['year', 'month'])
sale
Expand All @@ -3971,7 +3983,7 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
2013 7 84
2014 10 31
Create a multi-index using a set of values and a column:
Create a MultiIndex using a set of values and a column:
>>> df.set_index([[1, 2, 3, 4], 'year'])
month sale
Expand All @@ -3980,12 +3992,7 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
2 2014 4 40
3 2013 7 84
4 2014 10 31
Returns
-------
dataframe : DataFrame
"""
inplace = validate_bool_kwarg(inplace, 'inplace')
if not isinstance(keys, list):
keys = [keys]

Expand All @@ -4008,65 +4015,10 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
if missing:
raise KeyError('{}'.format(missing))

if inplace:
frame = self
else:
frame = self.copy()

arrays = []
names = []
if append:
names = [x for x in self.index.names]
if isinstance(self.index, ABCMultiIndex):
for i in range(self.index.nlevels):
arrays.append(self.index._get_level_values(i))
else:
arrays.append(self.index)

to_remove = []
for col in keys:
if isinstance(col, ABCMultiIndex):
for n in range(col.nlevels):
arrays.append(col._get_level_values(n))
names.extend(col.names)
elif isinstance(col, (ABCIndexClass, ABCSeries)):
# if Index then not MultiIndex (treated above)
arrays.append(col)
names.append(col.name)
elif isinstance(col, (list, np.ndarray)):
arrays.append(col)
names.append(None)
elif (is_list_like(col)
and not (isinstance(col, tuple) and col in self)):
# all other list-likes (but avoid valid column keys)
col = list(col) # ensure iterator do not get read twice etc.
arrays.append(col)
names.append(None)
# from here, col can only be a column label
else:
arrays.append(frame[col]._values)
names.append(col)
if drop:
to_remove.append(col)

index = ensure_index_from_sequences(arrays, names)

if verify_integrity and not index.is_unique:
duplicates = index[index.duplicated()].unique()
raise ValueError('Index has duplicate keys: {dup}'.format(
dup=duplicates))

# use set to handle duplicate column names gracefully in case of drop
for c in set(to_remove):
del frame[c]

# clear up memory usage
index._cleanup()

frame.index = index

if not inplace:
return frame
vi = verify_integrity
return super(DataFrame, self).set_index(keys=keys, drop=drop,
append=append, inplace=inplace,
verify_integrity=vi)

def reset_index(self, level=None, drop=False, inplace=False, col_level=0,
col_fill=''):
Expand Down
144 changes: 141 additions & 3 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,13 @@
from pandas.core.dtypes.cast import maybe_promote, maybe_upcast_putmask
from pandas.core.dtypes.inference import is_hashable
from pandas.core.dtypes.missing import isna, notna
from pandas.core.dtypes.generic import ABCSeries, ABCPanel, ABCDataFrame
from pandas.core.dtypes.generic import (ABCIndexClass, ABCMultiIndex, ABCPanel,
ABCSeries, ABCDataFrame)

from pandas.core.base import PandasObject, SelectionMixin
from pandas.core.index import (Index, MultiIndex, ensure_index,
InvalidIndexError, RangeIndex)
from pandas.core.index import (Index, MultiIndex,
InvalidIndexError, RangeIndex,
ensure_index, ensure_index_from_sequences)
import pandas.core.indexing as indexing
from pandas.core.indexes.datetimes import DatetimeIndex
from pandas.core.indexes.period import PeriodIndex, Period
Expand Down Expand Up @@ -643,6 +645,142 @@ def _set_axis(self, axis, labels):
self._data.set_axis(axis, labels)
self._clear_item_cache()

def set_index(self, keys, drop=True, append=False, inplace=False,
              verify_integrity=False):
    """
    Build a new index (row labels) from the given arrays and/or labels.

    Parameters
    ----------
    keys : list of column labels / arrays
        Each element is either a Series, Index, MultiIndex, list,
        np.ndarray, another list-like, or a label that is looked up
        on the object itself (column labels, for DataFrame). The
        Series/DataFrame overrides are expected to validate ``keys``
        and wrap a single key in a list before delegating here.
    drop : boolean, default True
        Remove the entries that were selected by label once they have
        been moved into the index. Keys passed as arrays are unaffected.
    append : boolean, default False
        Keep the levels of the existing index and add the new ones
        after them instead of replacing the index.
    inplace : boolean, default False
        Modify the Series/DataFrame in place (do not create a new object).
    verify_integrity : boolean, default False
        Check the new index for duplicates. Otherwise defer the check
        until necessary. Leaving this False improves the performance of
        this method.

    Returns
    -------
    reindexed : Series/DataFrame if inplace is False, else None

    Raises
    ------
    ValueError
        If ``verify_integrity`` is True and the resulting index contains
        duplicate entries.

    See Also
    --------
    DataFrame.set_index: method adapted for DataFrame
    Series.set_index: method adapted for Series

    Examples
    --------
    >>> df = pd.DataFrame({'month': [1, 4, 7, 10],
    ...                    'year': [2012, 2014, 2013, 2014],
    ...                    'sale': [55, 40, 84, 31]})
    >>> df
       month  year  sale
    0      1  2012    55
    1      4  2014    40
    2      7  2013    84
    3     10  2014    31

    Use the 'month' column as the index:

    >>> df.set_index('month')
           year  sale
    month
    1      2012    55
    4      2014    40
    7      2013    84
    10     2014    31

    Combine the 'year' and 'month' columns into a MultiIndex:

    >>> df.set_index(['year', 'month'])
                sale
    year month
    2012 1        55
    2014 4        40
    2013 7        84
    2014 10       31

    Combine a plain list of values with a column into a MultiIndex:

    >>> df.set_index([[1, 2, 3, 4], 'year'])
            month  sale
      year
    1 2012      1    55
    2 2014      4    40
    3 2013      7    84
    4 2014     10    31
    """
    # NOTE: ``keys`` is not validated here; that is the job of
    # Series.set_index / DataFrame.set_index.
    inplace = validate_bool_kwarg(inplace, 'inplace')
    obj = self if inplace else self.copy()

    arrays = []
    names = []
    if append:
        # start from the levels (and names) of the current index
        current = self.index
        names = list(current.names)
        if isinstance(current, ABCMultiIndex):
            arrays.extend(current._get_level_values(lvl)
                          for lvl in range(current.nlevels))
        else:
            arrays.append(current)

    labels_to_drop = []
    for key in keys:
        if isinstance(key, ABCMultiIndex):
            # a MultiIndex contributes one array and one name per level
            arrays.extend(key._get_level_values(lvl)
                          for lvl in range(key.nlevels))
            names.extend(key.names)
        elif isinstance(key, (ABCIndexClass, ABCSeries)):
            # any Index reaching this branch is not a MultiIndex
            arrays.append(key)
            names.append(key.name)
        elif isinstance(key, (list, np.ndarray)):
            arrays.append(key)
            names.append(None)
        elif is_list_like(key) and (not isinstance(key, tuple)
                                    or key not in self):
            # remaining list-likes, except tuples that are valid labels;
            # materialise once so that iterators are not consumed twice
            arrays.append(list(key))
            names.append(None)
        else:
            # everything else is treated as a label on the object itself
            arrays.append(obj[key]._values)
            names.append(key)
            if drop:
                labels_to_drop.append(key)

    index = ensure_index_from_sequences(arrays, names)

    if verify_integrity and not index.is_unique:
        duplicates = list(index[index.duplicated()])
        raise ValueError('Index has duplicate keys: {dup}'.format(
            dup=duplicates))

    # de-duplicate first so that a label given twice is only deleted once
    for label in set(labels_to_drop):
        del obj[label]

    # clear up memory usage
    index._cleanup()

    obj.index = index

    return None if inplace else obj

def transpose(self, *args, **kwargs):
"""
Permute the dimensions of the %(klass)s
Expand Down
Loading

0 comments on commit 949e699

Please sign in to comment.