Skip to content
Permalink
v1.0.3
Switch branches/tags
Go to file
 
 
Cannot retrieve contributors at this time
11368 lines (9610 sloc) 367 KB
import collections
from datetime import timedelta
import functools
import gc
import json
import operator
import pickle
import re
from textwrap import dedent
from typing import (
Any,
Callable,
Dict,
FrozenSet,
Hashable,
List,
Mapping,
Optional,
Sequence,
Set,
Tuple,
Type,
Union,
)
import warnings
import weakref
import numpy as np
from pandas._config import config
from pandas._libs import Timestamp, iNaT, lib, properties
from pandas._typing import (
Axis,
Dtype,
FilePathOrBuffer,
FrameOrSeries,
JSONSerializable,
Level,
Renamer,
)
from pandas.compat import set_function_name
from pandas.compat._optional import import_optional_dependency
from pandas.compat.numpy import function as nv
from pandas.errors import AbstractMethodError
from pandas.util._decorators import Appender, Substitution, rewrite_axis_style_signature
from pandas.util._validators import (
validate_bool_kwarg,
validate_fillna_kwargs,
validate_percentile,
)
from pandas.core.dtypes.common import (
ensure_int64,
ensure_object,
ensure_str,
is_bool,
is_bool_dtype,
is_datetime64_any_dtype,
is_datetime64tz_dtype,
is_dict_like,
is_extension_array_dtype,
is_float,
is_integer,
is_list_like,
is_number,
is_numeric_dtype,
is_object_dtype,
is_period_arraylike,
is_re_compilable,
is_scalar,
is_timedelta64_dtype,
pandas_dtype,
)
from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
from pandas.core.dtypes.inference import is_hashable
from pandas.core.dtypes.missing import isna, notna
import pandas as pd
from pandas.core import missing, nanops
import pandas.core.algorithms as algos
from pandas.core.base import PandasObject, SelectionMixin
import pandas.core.common as com
from pandas.core.construction import create_series_with_explicit_dtype
from pandas.core.indexes.api import (
Index,
InvalidIndexError,
MultiIndex,
RangeIndex,
ensure_index,
)
from pandas.core.indexes.datetimes import DatetimeIndex
from pandas.core.indexes.period import Period, PeriodIndex
import pandas.core.indexing as indexing
from pandas.core.internals import BlockManager
from pandas.core.missing import find_valid_index
from pandas.core.ops import _align_method_FRAME
from pandas.io.formats import format as fmt
from pandas.io.formats.format import DataFrameFormatter, format_percentiles
from pandas.io.formats.printing import pprint_thing
from pandas.tseries.frequencies import to_offset
# Registry of docstring templates keyed by name. The goal is to be able to
# define the docs close to the function they describe, while still being able
# to share them.
_shared_docs: Dict[str, str] = dict()
# Default substitution values for docstring placeholders such as %(klass)s and
# %(axes)s (NOTE(review): presumably overridden per subclass -- confirm).
_shared_doc_kwargs = dict(
axes="keywords for axes",
klass="Series/DataFrame",
axes_single_arg="int or labels for object",
args_transpose="axes to permute (int or label for object)",
optional_by="""
by : str or list of str
Name or list of names to sort by""",
)
def _single_replace(self, to_replace, method, inplace, limit):
    """
    Replace values in a Series using a fill method.

    Used when ``replace`` is called without an explicit replacement value:
    every position matching ``to_replace`` is masked and then filled with the
    function looked up for ``method``.

    Parameters
    ----------
    to_replace : object
        Value(s) to mask and fill over.
    method : str
        Name of the fill method, passed to ``missing.get_fill_func``.
    inplace : bool
        If True, modify ``self`` and return None.
    limit : int or None
        Forwarded to the fill function.

    Returns
    -------
    Series or None

    Raises
    ------
    TypeError
        If ``self`` is not one-dimensional.
    """
    if self.ndim != 1:
        raise TypeError(
            f"cannot replace {to_replace} with method {method} on a "
            f"{type(self).__name__}"
        )

    dtype_before = self.dtype
    target = self if inplace else self.copy()
    filler = missing.get_fill_func(method)

    to_fill = missing.mask_missing(target.values, to_replace)
    filled = filler(target.values, limit=limit, mask=to_fill)

    # NOTE(review): the early exit presumably relies on the fill function
    # having written into ``target.values`` directly -- confirm.
    if filled.dtype == dtype_before and inplace:
        return

    rebuilt = pd.Series(filled, index=self.index, dtype=self.dtype).__finalize__(self)

    if not inplace:
        return rebuilt
    self._update_inplace(rebuilt._data)
    return
# Alias for the builtin ``bool``; needed because NDFrame defines a method
# named ``bool``.
bool_t = bool
class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin):
"""
N-dimensional analogue of DataFrame. Stores multi-dimensional data in a
size-mutable, labeled data structure.
Parameters
----------
data : BlockManager
axes : list
copy : bool, default False
"""
# Attribute names treated as internal to the object
# (NOTE(review): presumably consulted by attribute get/set hooks defined
# outside this view -- confirm).
_internal_names: List[str] = [
"_data",
"_cacher",
"_item_cache",
"_cache",
"_is_copy",
"_subtyp",
"_name",
"_index",
"_default_kind",
"_default_fill_value",
"_metadata",
"__array_struct__",
"__array_interface__",
]
# Set form of ``_internal_names``; ``_setup_axes`` also adds the axis names.
_internal_names_set: Set[str] = set(_internal_names)
_accessors: Set[str] = set()
_deprecations: FrozenSet[str] = frozenset(["get_values", "ix"])
_metadata: List[str] = []
# Class-level default; ``__init__`` sets it to None on every instance.
_is_copy = None
# Annotation-only declarations; ``_data`` and ``_attrs`` are assigned in
# ``__init__``.
_data: BlockManager
_attrs: Dict[Optional[Hashable], Any]
_typ: str
# ----------------------------------------------------------------------
# Constructors
def __init__(
    self,
    data: BlockManager,
    axes: Optional[List[Index]] = None,
    copy: bool = False,
    dtype: Optional[Dtype] = None,
    attrs: Optional[Mapping[Optional[Hashable], Any]] = None,
    fastpath: bool = False,
):
    """
    Initialise the object from a block manager.

    Parameters
    ----------
    data : BlockManager
        Manager holding the values.
    axes : list of Index, optional
        If given, ``data`` is reindexed along axis ``i`` to ``axes[i]``.
    copy : bool, default False
        Copy ``data``; only consulted when ``dtype`` is None.
    dtype : dtype, optional
        If given, ``data`` is cast with ``astype``.
    attrs : mapping, optional
        Stored as a new ``dict``; an empty dict when omitted.
    fastpath : bool, default False
        If True, skip the cast / copy / reindex steps and store ``data``
        as passed.
    """
    if not fastpath:
        if dtype is not None:
            data = data.astype(dtype)
        elif copy:
            data = data.copy()

        for position, new_axis in enumerate(axes if axes is not None else ()):
            data = data.reindex_axis(new_axis, axis=position)

    # ``object.__setattr__`` is called directly
    # (NOTE(review): presumably to bypass the class's own ``__setattr__``,
    # which is not visible here -- confirm).
    for attr_name, attr_value in (
        ("_is_copy", None),
        ("_data", data),
        ("_item_cache", {}),
    ):
        object.__setattr__(self, attr_name, attr_value)

    object.__setattr__(self, "_attrs", {} if attrs is None else dict(attrs))
def _init_mgr(self, mgr, axes=None, dtype=None, copy=False):
    """
    Conform a manager to the requested axes, copy flag and dtype.

    Parameters
    ----------
    mgr : BlockManager
        Manager to conform.
    axes : dict, optional
        Mapping of axis identifier to new labels. Entries whose labels are
        None are skipped. ``None`` (the default) means no reindexing.
    dtype : dtype, optional
        Target dtype; the cast is skipped when the manager has a single
        block that already has this dtype.
    copy : bool, default False
        Copy the manager after reindexing.

    Returns
    -------
    BlockManager
    """
    # ``axes`` defaults to None: treat that as "nothing to reindex" instead
    # of failing on ``None.items()``.
    for axis_key, labels in (axes or {}).items():
        if labels is not None:
            mgr = mgr.reindex_axis(
                labels, axis=self._get_block_manager_axis(axis_key), copy=False
            )

    # make a copy if explicitly requested
    if copy:
        mgr = mgr.copy()

    if dtype is not None:
        # avoid further copies if we can
        if len(mgr.blocks) > 1 or mgr.blocks[0].values.dtype != dtype:
            mgr = mgr.astype(dtype=dtype)
    return mgr
# ----------------------------------------------------------------------
@property
def attrs(self) -> Dict[Optional[Hashable], Any]:
    """
    Dictionary of global attributes on this object.

    .. warning::

       attrs is experimental and may change without warning.
    """
    if self._attrs is not None:
        return self._attrs
    # Lazily replace a missing mapping with an empty dict.
    self._attrs = {}
    return self._attrs


@attrs.setter
def attrs(self, value: Mapping[Optional[Hashable], Any]) -> None:
    # Store a plain-dict copy of whatever mapping was supplied.
    self._attrs = dict(value)
def _validate_dtype(self, dtype):
    """
    Normalise ``dtype`` and reject compound dtypes.

    Returns None unchanged; otherwise returns ``pandas_dtype(dtype)``.

    Raises
    ------
    NotImplementedError
        If the normalised dtype has kind ``"V"`` (a compound dtype).
    """
    if dtype is None:
        return dtype

    dtype = pandas_dtype(dtype)
    if dtype.kind == "V":
        # a compound dtype
        raise NotImplementedError(
            "compound dtypes are not implemented"
            f" in the {type(self).__name__} constructor"
        )
    return dtype
# ----------------------------------------------------------------------
# Construction
@property
def _constructor(self: FrameOrSeries) -> Type[FrameOrSeries]:
    """
    Constructor for results with the same dimensions as the original.

    Abstract here: always raises ``AbstractMethodError``.
    """
    raise AbstractMethodError(self)
@property
def _constructor_sliced(self):
    """
    Constructor for results with one dimension fewer than the original,
    such as slicing a single column out of a DataFrame.

    Abstract here: always raises ``AbstractMethodError``.
    """
    raise AbstractMethodError(self)
@property
def _constructor_expanddim(self):
    """
    Constructor for results with one dimension more than the original,
    such as ``Series.to_frame()``.

    Not implemented here: always raises ``NotImplementedError``.
    """
    raise NotImplementedError
# ----------------------------------------------------------------------
# Axis
# Alternative spelling accepted for axis 0; applied first by
# ``_get_axis_number`` and ``_get_axis_name``.
_AXIS_ALIASES = {"rows": 0}
_AXIS_IALIASES = {0: "rows"}
_stat_axis_number = 0
_stat_axis_name = "index"
_ix = None
# Annotation-only declarations; the values are assigned per class by
# ``_setup_axes``.
_AXIS_ORDERS: List[str]
_AXIS_NUMBERS: Dict[str, int]
_AXIS_NAMES: Dict[int, str]
_AXIS_REVERSED: bool
_info_axis_number: int
_info_axis_name: str
_AXIS_LEN: int
@classmethod
def _setup_axes(cls, axes: List[str], docs: Dict[str, str]) -> None:
    """
    Provide axes setup for the major PandasObjects.

    Records the axis bookkeeping tables on ``cls`` and installs one
    ``AxisProperty`` per axis name.

    Parameters
    ----------
    axes : the names of the axes in order (lowest to highest)
    docs : docstrings for the axis properties
    """
    n_axes = len(axes)
    last = n_axes - 1
    reversed_layout = n_axes > 1

    cls._AXIS_ORDERS = axes
    cls._AXIS_NUMBERS = {name: number for number, name in enumerate(axes)}
    cls._AXIS_LEN = n_axes
    cls._AXIS_NAMES = dict(enumerate(axes))
    cls._AXIS_REVERSED = reversed_layout
    cls._info_axis_number = last
    cls._info_axis_name = axes[last]

    # setup the actual axis: with more than one axis the property slot is
    # ``1 - number``, otherwise the axis number itself.
    for number, name in cls._AXIS_NAMES.items():
        slot = 1 - number if reversed_layout else number
        setattr(cls, name, properties.AxisProperty(slot, docs.get(name, name)))
        cls._internal_names_set.add(name)
def _construct_axes_dict(self, axes=None, **kwargs):
    """
    Return a dict mapping axis name to this object's labels on that axis.

    ``axes`` selects which names to include (default: every name in
    ``_AXIS_ORDERS``); extra ``kwargs`` are merged in afterwards.
    """
    names = axes or self._AXIS_ORDERS
    result = {}
    for name in names:
        result[name] = self._get_axis(name)
    result.update(kwargs)
    return result
@staticmethod
def _construct_axes_dict_from(self, axes, **kwargs):
    """
    Return a dict pairing ``self._AXIS_ORDERS`` with the passed ``axes``.

    Declared as a staticmethod, so ``self`` must be passed explicitly.
    Extra ``kwargs`` are merged in afterwards.
    """
    paired = dict(zip(self._AXIS_ORDERS, axes))
    paired.update(kwargs)
    return paired
def _construct_axes_from_arguments(
    self, args, kwargs, require_all: bool = False, sentinel=None
):
    """
    Construct and return axes if supplied in args/kwargs.

    Axis names missing from ``kwargs`` are filled from ``args`` by position,
    in ``_AXIS_ORDERS`` order. ``kwargs`` is modified in place: the axis
    entries are popped out of it.

    Parameters
    ----------
    args : iterable
        Positional axis arguments.
    kwargs : dict
        Keyword arguments, possibly containing axis names.
    require_all : bool, default False
        Raise if an axis is supplied neither by keyword nor by position.
    sentinel : object, default None
        Value used for an axis that was not supplied; useful to distinguish
        an explicit None where None has special meaning.

    Returns
    -------
    tuple of (axes dict, remaining kwargs)

    Raises
    ------
    TypeError
        If ``require_all`` is True and an axis argument is missing.
    """
    # construct the args
    positional = collections.deque(args)

    for name in self._AXIS_ORDERS:
        if name in kwargs:
            continue
        # look for an argument by position
        try:
            kwargs[name] = positional.popleft()
        except IndexError:
            if require_all:
                raise TypeError("not enough/duplicate arguments specified!")

    axes = {}
    for name in self._AXIS_ORDERS:
        axes[name] = kwargs.pop(name, sentinel)
    return axes, kwargs
@classmethod
def _from_axes(cls: Type[FrameOrSeries], data, axes, **kwargs) -> FrameOrSeries:
    """
    Build an instance of ``cls`` from data plus a sequence of axes.

    A ``BlockManager`` is passed straight to the constructor and ``axes`` is
    ignored. Otherwise ``axes`` (reversed first when ``_AXIS_REVERSED``) is
    turned into keyword arguments together with ``copy=False``.
    """
    # for construction from BlockManager
    if isinstance(data, BlockManager):
        return cls(data, **kwargs)

    if cls._AXIS_REVERSED:
        axes = axes[::-1]
    ctor_kwargs = cls._construct_axes_dict_from(cls, axes, copy=False)
    ctor_kwargs.update(kwargs)
    return cls(data, **ctor_kwargs)
@classmethod
def _get_axis_number(cls, axis):
    """
    Return the integer number of ``axis``.

    ``axis`` may be an alias, an axis number or an axis name.

    Raises
    ------
    ValueError
        If ``axis`` does not identify an axis of ``cls``.
    """
    resolved = cls._AXIS_ALIASES.get(axis, axis)
    if is_integer(resolved):
        if resolved in cls._AXIS_NAMES:
            return resolved
    elif resolved in cls._AXIS_NUMBERS:
        return cls._AXIS_NUMBERS[resolved]
    raise ValueError(f"No axis named {resolved} for object type {cls}")
@classmethod
def _get_axis_name(cls, axis):
    """
    Return the name of ``axis``.

    ``axis`` may be an alias, an axis number or an axis name.

    Raises
    ------
    ValueError
        If ``axis`` does not identify an axis of ``cls``.
    """
    resolved = cls._AXIS_ALIASES.get(axis, axis)
    if isinstance(resolved, str):
        if resolved in cls._AXIS_NUMBERS:
            return resolved
    elif resolved in cls._AXIS_NAMES:
        return cls._AXIS_NAMES[resolved]
    raise ValueError(f"No axis named {resolved} for object type {cls}")
def _get_axis(self, axis):
    """Return the labels stored under the attribute named after ``axis``."""
    return getattr(self, self._get_axis_name(axis))
@classmethod
def _get_block_manager_axis(cls, axis):
    """Map the axis to the block_manager axis."""
    number = cls._get_axis_number(axis)
    if not cls._AXIS_REVERSED:
        return number
    # Reversed layout: count from the last axis instead of the first.
    return cls._AXIS_LEN - 1 - number
def _get_axis_resolvers(self, axis: str) -> Dict[str, ABCSeries]:
    """
    Build name -> values resolvers for one axis (``"index"``/``"columns"``).

    Each level of the axis contributes a Series of its level values, indexed
    by the axis itself. Named levels are keyed by name; unnamed levels by
    ``"<p>level_<i>"`` where ``<p>`` is the first letter of ``axis``. The
    axis itself is stored under ``axis`` (as-is for a MultiIndex, otherwise
    converted to a Series).
    """
    # index or columns
    labels = getattr(self, axis)
    resolvers = {}
    first_letter = axis[0]

    for position, level_name in enumerate(labels.names):
        if level_name is None:
            # prefix with 'i' or 'c' depending on the input axis
            # e.g., you must do ilevel_0 for the 0th level of an unnamed
            # multiindex
            key = f"{first_letter}level_{position}"
            level = position
        else:
            key = level = level_name

        as_series = labels.get_level_values(level).to_series()
        as_series.index = labels
        resolvers[key] = as_series

    # put the index/columns itself in the dict
    resolvers[axis] = labels if isinstance(labels, MultiIndex) else labels.to_series()
    return resolvers
def _get_index_resolvers(self) -> Dict[str, ABCSeries]:
    """
    Merge the axis resolvers of every axis into one dict.

    Keys are passed through ``clean_column_name``; integer keys are dropped.
    """
    from pandas.core.computation.parsing import clean_column_name

    merged: Dict[str, ABCSeries] = {}
    for axis_name in self._AXIS_ORDERS:
        merged.update(self._get_axis_resolvers(axis_name))

    return {
        clean_column_name(key): value
        for key, value in merged.items()
        if not isinstance(key, int)
    }
def _get_cleaned_column_resolvers(self) -> Dict[str, ABCSeries]:
    """
    Return the special character free column resolvers of a dataframe.

    Column names with special characters are 'cleaned up' so that they can
    be referred to by backtick quoting. A Series yields a single entry keyed
    by its cleaned name; otherwise integer keys are dropped.

    Used in :meth:`DataFrame.eval`.
    """
    from pandas.core.computation.parsing import clean_column_name

    if isinstance(self, ABCSeries):
        return {clean_column_name(self.name): self}

    cleaned = {}
    for key, value in self.items():
        if isinstance(key, int):
            continue
        cleaned[clean_column_name(key)] = value
    return cleaned
@property
def _info_axis(self):
    """Labels stored under the attribute named by ``_info_axis_name``."""
    attribute = self._info_axis_name
    return getattr(self, attribute)
@property
def _stat_axis(self):
    """Labels stored under the attribute named by ``_stat_axis_name``."""
    attribute = self._stat_axis_name
    return getattr(self, attribute)
@property
def shape(self) -> Tuple[int, ...]:
    """
    Return a tuple of axis dimensions.

    One length per axis, in ``_AXIS_ORDERS`` order.
    """
    lengths = []
    for name in self._AXIS_ORDERS:
        lengths.append(len(self._get_axis(name)))
    return tuple(lengths)
@property
def axes(self) -> List[Index]:
    """
    Return index label(s) of the internal NDFrame.
    """
    # Looked up by axis name rather than read off the block manager, because
    # with reversed axes the manager holds them in the opposite order.
    return list(map(self._get_axis, self._AXIS_ORDERS))
@property
def ndim(self) -> int:
    """
    Return an int representing the number of axes / array dimensions.

    Return 1 if Series. Otherwise return 2 if DataFrame.

    See Also
    --------
    ndarray.ndim : Number of array dimensions.

    Examples
    --------
    >>> s = pd.Series({'a': 1, 'b': 2, 'c': 3})
    >>> s.ndim
    1

    >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
    >>> df.ndim
    2
    """
    manager = self._data
    return manager.ndim
@property
def size(self) -> int:
    """
    Return an int representing the number of elements in this object.

    Return the number of rows if Series. Otherwise return the number of
    rows times number of columns if DataFrame.

    Returns
    -------
    int
        Product of the entries of ``shape``, as a builtin ``int``.

    See Also
    --------
    ndarray.size : Number of elements in the array.

    Examples
    --------
    >>> s = pd.Series({'a': 1, 'b': 2, 'c': 3})
    >>> s.size
    3

    >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
    >>> df.size
    4
    """
    # ``np.prod`` yields a NumPy scalar; coerce it so the documented ``int``
    # return type actually holds.
    return int(np.prod(self.shape))
@property
def _selected_obj(self: FrameOrSeries) -> FrameOrSeries:
    """Internal compat with SelectionMixin: the object itself."""
    return self
@property
def _obj_with_exclusions(self: FrameOrSeries) -> FrameOrSeries:
    """Internal compat with SelectionMixin: the object itself."""
    return self
def set_axis(self, labels, axis=0, inplace=False):
    """
    Assign desired index to given axis.

    Indexes for column or row labels can be changed by assigning
    a list-like or Index.

    .. versionchanged:: 0.21.0

       The signature is now `labels` and `axis`, consistent with
       the rest of pandas API. Previously, the `axis` and `labels`
       arguments were respectively the first and second positional
       arguments.

    Parameters
    ----------
    labels : list-like, Index
        The values for the new index.
    axis : {0 or 'index', 1 or 'columns'}, default 0
        The axis to update. The value 0 identifies the rows, and 1
        identifies the columns.
    inplace : bool, default False
        If True, modify this object and return None; otherwise return a
        new %(klass)s.

    Returns
    -------
    renamed : %(klass)s or None
        An object of same type as caller if inplace=False, None otherwise.

    See Also
    --------
    DataFrame.rename_axis : Alter the name of the index or columns.

    Examples
    --------
    **Series**

    >>> s = pd.Series([1, 2, 3])
    >>> s.set_axis(['a', 'b', 'c'], axis=0)
    a    1
    b    2
    c    3
    dtype: int64

    **DataFrame**

    >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})

    Change the row labels.

    >>> df.set_axis(['a', 'b', 'c'], axis='index')
       A  B
    a  1  4
    b  2  5
    c  3  6

    Change the column labels.

    >>> df.set_axis(['I', 'II'], axis='columns')
       I  II
    0  1   4
    1  2   5
    2  3   6

    Now, update the labels inplace.

    >>> df.set_axis(['i', 'ii'], axis='columns', inplace=True)
    >>> df
       i  ii
    0  1   4
    1  2   5
    2  3   6
    """
    if not inplace:
        # Work on a copy and delegate to the in-place path.
        duplicate = self.copy()
        duplicate.set_axis(labels, axis=axis, inplace=True)
        return duplicate

    setattr(self, self._get_axis_name(axis), labels)
def _set_axis(self, axis, labels) -> None:
    """Set ``labels`` on the block manager's ``axis`` and clear the item cache."""
    manager = self._data
    manager.set_axis(axis, labels)
    self._clear_item_cache()
def swapaxes(self: FrameOrSeries, axis1, axis2, copy=True) -> FrameOrSeries:
    """
    Interchange axes and swap values axes appropriately.

    Parameters
    ----------
    axis1, axis2 : axis number or name
        The two axes to exchange.
    copy : bool, default True
        Copy the (swapped) values; when both axes are the same and ``copy``
        is False the object itself is returned.

    Returns
    -------
    y : same as input
    """
    first = self._get_axis_number(axis1)
    second = self._get_axis_number(axis2)

    if first == second:
        return self.copy() if copy else self

    exchange = {first: second, second: first}
    swapped_values = self.values.swapaxes(first, second)
    if copy:
        swapped_values = swapped_values.copy()

    return self._constructor(
        swapped_values,
        *(self._get_axis(exchange.get(k, k)) for k in range(self._AXIS_LEN)),
    ).__finalize__(self)
def droplevel(self: FrameOrSeries, level, axis=0) -> FrameOrSeries:
    """
    Return DataFrame with requested index / column level(s) removed.

    .. versionadded:: 0.24.0

    Parameters
    ----------
    level : int, str, or list-like
        If a string is given, must be the name of a level
        If list-like, elements must be names or positional indexes
        of levels.

    axis : {0 or 'index', 1 or 'columns'}, default 0

    Returns
    -------
    DataFrame
        DataFrame with requested index / column level(s) removed.

    Examples
    --------
    >>> df = pd.DataFrame([
    ...     [1, 2, 3, 4],
    ...     [5, 6, 7, 8],
    ...     [9, 10, 11, 12]
    ... ]).set_index([0, 1]).rename_axis(['a', 'b'])

    >>> df.columns = pd.MultiIndex.from_tuples([
    ...    ('c', 'e'), ('d', 'f')
    ... ], names=['level_1', 'level_2'])

    >>> df
    level_1   c   d
    level_2   e   f
    a b
    1 2      3   4
    5 6      7   8
    9 10    11  12

    >>> df.droplevel('a')
    level_1   c   d
    level_2   e   f
    b
    2        3   4
    6        7   8
    10      11  12

    >>> df.droplevel('level_2', axis=1)
    level_1   c   d
    a b
    1 2      3   4
    5 6      7   8
    9 10    11  12
    """
    trimmed = self._get_axis(axis).droplevel(level)
    return self.set_axis(trimmed, axis=axis, inplace=False)
def pop(self: FrameOrSeries, item) -> FrameOrSeries:
    """
    Return item and drop from frame. Raise KeyError if not found.

    Parameters
    ----------
    item : str
        Label of column to be popped.

    Returns
    -------
    Series

    Examples
    --------
    >>> df = pd.DataFrame([('falcon', 'bird', 389.0),
    ...                    ('parrot', 'bird', 24.0),
    ...                    ('lion', 'mammal', 80.5),
    ...                    ('monkey', 'mammal', np.nan)],
    ...                   columns=('name', 'class', 'max_speed'))
    >>> df
         name   class  max_speed
    0  falcon    bird      389.0
    1  parrot    bird       24.0
    2    lion  mammal       80.5
    3  monkey  mammal        NaN

    >>> df.pop('class')
    0      bird
    1      bird
    2    mammal
    3    mammal
    Name: class, dtype: object

    >>> df
         name  max_speed
    0  falcon      389.0
    1  parrot       24.0
    2    lion       80.5
    3  monkey        NaN
    """
    popped = self[item]
    del self[item]

    # Best effort: results without a ``_reset_cacher`` are returned as-is.
    try:
        popped._reset_cacher()
    except AttributeError:
        pass

    return popped
def squeeze(self, axis=None):
    """
    Squeeze 1 dimensional axis objects into scalars.

    Series or DataFrames with a single element are squeezed to a scalar.
    DataFrames with a single column or a single row are squeezed to a
    Series. Otherwise the object is unchanged.

    This method is most useful when you don't know if your
    object is a Series or DataFrame, but you do know it has just a single
    column. In that case you can safely call `squeeze` to ensure you have a
    Series.

    Parameters
    ----------
    axis : {0 or 'index', 1 or 'columns', None}, default None
        A specific axis to squeeze. By default, all length-1 axes are
        squeezed.

    Returns
    -------
    DataFrame, Series, or scalar
        The projection after squeezing `axis` or all the axes.

    See Also
    --------
    Series.iloc : Integer-location based indexing for selecting scalars.
    DataFrame.iloc : Integer-location based indexing for selecting Series.
    Series.to_frame : Inverse of DataFrame.squeeze for a
        single-column DataFrame.

    Examples
    --------
    >>> primes = pd.Series([2, 3, 5, 7])

    Slicing might produce a Series with a single value:

    >>> even_primes = primes[primes % 2 == 0]
    >>> even_primes
    0    2
    dtype: int64

    >>> even_primes.squeeze()
    2

    Squeezing objects with more than one value in every axis does nothing:

    >>> odd_primes = primes[primes % 2 == 1]
    >>> odd_primes.squeeze()
    1    3
    2    5
    3    7
    dtype: int64

    Squeezing is even more effective when used with DataFrames.

    >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['a', 'b'])

    Slicing a single column will produce a DataFrame with the columns
    having only one value:

    >>> df_a = df[['a']]
    >>> df_a
       a
    0  1
    1  3

    So the columns can be squeezed down, resulting in a Series:

    >>> df_a.squeeze('columns')
    0    1
    1    3
    Name: a, dtype: int64

    Slicing a single row from a single column will produce a single
    scalar DataFrame:

    >>> df_0a = df.loc[df.index < 1, ['a']]
    >>> df_0a
       a
    0  1

    Squeezing the rows produces a single scalar Series:

    >>> df_0a.squeeze('rows')
    a    1
    Name: 0, dtype: int64

    Squeezing all axes will project directly into a scalar:

    >>> df_0a.squeeze()
    1
    """
    # Axis numbers eligible for squeezing: all of them, or just the one asked.
    candidates = self._AXIS_NAMES if axis is None else (self._get_axis_number(axis),)
    positional = self.iloc

    picks = []
    for number, labels in enumerate(self.axes):
        if number in candidates and len(labels) == 1:
            picks.append(0)
        else:
            picks.append(slice(None))
    return positional[tuple(picks)]
def swaplevel(self: FrameOrSeries, i=-2, j=-1, axis=0) -> FrameOrSeries:
    """
    Swap levels i and j in a MultiIndex on a particular axis.

    Parameters
    ----------
    i, j : int, str (can be mixed)
        Level of index to be swapped. Can pass level name as string.
    axis : axis number or name, default 0
        Resolved with ``_get_axis_number`` and used to index the copy's
        block-manager axes.

    Returns
    -------
    swapped : same type as caller (new object)
    """
    axis_number = self._get_axis_number(axis)
    swapped = self.copy()
    manager = swapped._data
    manager.set_axis(axis_number, manager.axes[axis_number].swaplevel(i, j))
    return swapped
# ----------------------------------------------------------------------
# Rename
def rename(
    self: FrameOrSeries,
    mapper: Optional[Renamer] = None,
    *,
    index: Optional[Renamer] = None,
    columns: Optional[Renamer] = None,
    axis: Optional[Axis] = None,
    copy: bool = True,
    inplace: bool = False,
    level: Optional[Level] = None,
    errors: str = "ignore",
) -> Optional[FrameOrSeries]:
    """
    Alter axes input function or functions. Function / dict values must be
    unique (1-to-1). Labels not contained in a dict / Series will be left
    as-is. Extra labels listed don't throw an error. Alternatively, change
    ``Series.name`` with a scalar value (Series only).

    Parameters
    ----------
    %(axes)s : scalar, list-like, dict-like or function, optional
        Scalar or list-like will alter the ``Series.name`` attribute,
        and raise on DataFrame.
        dict-like or functions are transformations to apply to
        that axis' values
    copy : bool, default True
        Also copy underlying data.
    inplace : bool, default False
        Whether to return a new %(klass)s. If True then value of copy is
        ignored.
    level : int or level name, default None
        In case of a MultiIndex, only rename labels in the specified
        level. The level is resolved separately against each axis being
        renamed.
    errors : {'ignore', 'raise'}, default 'ignore'
        If 'raise', raise a `KeyError` when a dict-like `mapper`, `index`,
        or `columns` contains labels that are not present in the Index
        being transformed.
        If 'ignore', existing keys will be renamed and extra keys will be
        ignored.

    Returns
    -------
    renamed : %(klass)s (new object)

    Raises
    ------
    TypeError
        If no mapper is given, or if ``index``/``columns`` is combined
        with ``mapper`` or ``axis``.
    KeyError
        If any of the labels is not found in the selected axis and
        "errors='raise'".

    See Also
    --------
    NDFrame.rename_axis

    Examples
    --------
    >>> s = pd.Series([1, 2, 3])
    >>> s.rename("my_name")  # scalar, changes Series.name
    0    1
    1    2
    2    3
    Name: my_name, dtype: int64
    >>> s.rename(lambda x: x ** 2)  # function, changes labels
    0    1
    1    2
    4    3
    dtype: int64
    >>> s.rename({1: 3, 2: 5})  # mapping, changes labels
    0    1
    3    2
    5    3
    dtype: int64

    Since ``DataFrame`` doesn't have a ``.name`` attribute,
    only mapping-type arguments are allowed.

    >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
    >>> df.rename(2)
    Traceback (most recent call last):
    ...
    TypeError: 'int' object is not callable

    ``DataFrame.rename`` supports two calling conventions

    * ``(index=index_mapper, columns=columns_mapper, ...)``
    * ``(mapper, axis={'index', 'columns'}, ...)``

    We *highly* recommend using keyword arguments to clarify your
    intent.

    >>> df.rename(index=str, columns={"A": "a", "B": "c"})
       a  c
    0  1  4
    1  2  5
    2  3  6

    >>> df.rename(index=str, columns={"A": "a", "C": "c"})
       a  B
    0  1  4
    1  2  5
    2  3  6

    Using axis-style parameters

    >>> df.rename(str.lower, axis='columns')
       a  b
    0  1  4
    1  2  5
    2  3  6

    >>> df.rename({1: 2, 2: 4}, axis='index')
       A  B
    0  1  4
    2  2  5
    4  3  6

    See the :ref:`user guide <basics.rename>` for more.
    """
    if mapper is None and index is None and columns is None:
        raise TypeError("must pass an index to rename")

    if index is not None or columns is not None:
        if axis is not None:
            raise TypeError(
                "Cannot specify both 'axis' and any of 'index' or 'columns'"
            )
        elif mapper is not None:
            raise TypeError(
                "Cannot specify both 'mapper' and any of 'index' or 'columns'"
            )
    else:
        # use the mapper argument
        if axis and self._get_axis_number(axis) == 1:
            columns = mapper
        else:
            index = mapper

    result = self if inplace else self.copy(deep=copy)

    for axis_no, replacements in enumerate((index, columns)):
        if replacements is None:
            continue

        ax = self._get_axis(axis_no)
        baxis = self._get_block_manager_axis(axis_no)
        f = com.get_rename_function(replacements)

        # Resolve the level against *this* axis only. Overwriting ``level``
        # here would leak the index's level number into the columns pass.
        axis_level = None if level is None else ax._get_level_number(level)

        # GH 13473
        if not callable(replacements):
            indexer = ax.get_indexer_for(replacements)
            if errors == "raise" and len(indexer[indexer == -1]):
                missing_labels = [
                    label
                    for position, label in enumerate(replacements)
                    if indexer[position] == -1
                ]
                raise KeyError(f"{missing_labels} not found in axis")

        result._data = result._data.rename_axis(
            f, axis=baxis, copy=copy, level=axis_level
        )
        result._clear_item_cache()

    if inplace:
        self._update_inplace(result._data)
        return None
    else:
        return result.__finalize__(self)
@rewrite_axis_style_signature("mapper", [("copy", True), ("inplace", False)])
def rename_axis(self, mapper=lib.no_default, **kwargs):
    """
    Set the name of the axis for the index or columns.

    Parameters
    ----------
    mapper : scalar, list-like, optional
        Value to set the axis name attribute.
    index, columns : scalar, list-like, dict-like or function, optional
        A scalar, list-like, dict-like or functions transformations to
        apply to that axis' values.

        Use either ``mapper`` and ``axis`` to
        specify the axis to target with ``mapper``, or ``index``
        and/or ``columns``.

        .. versionchanged:: 0.24.0

    axis : {0 or 'index', 1 or 'columns'}, default 0
        The axis to rename.
    copy : bool, default True
        Also copy underlying data.
    inplace : bool, default False
        Modifies the object directly, instead of creating a new Series
        or DataFrame.

    Returns
    -------
    Series, DataFrame, or None
        The same type as the caller or None if `inplace` is True.

    Raises
    ------
    TypeError
        If an unexpected keyword argument is passed.
    ValueError
        If ``mapper`` is dict-like or a function.

    See Also
    --------
    Series.rename : Alter Series index labels or name.
    DataFrame.rename : Alter DataFrame index labels or name.
    Index.rename : Set new names on index.

    Notes
    -----
    ``DataFrame.rename_axis`` supports two calling conventions

    * ``(index=index_mapper, columns=columns_mapper, ...)``
    * ``(mapper, axis={'index', 'columns'}, ...)``

    The first calling convention will only modify the names of
    the index and/or the names of the Index object that is the columns.
    In this case, the parameter ``copy`` is ignored.

    The second calling convention will modify the names of the
    the corresponding index if mapper is a list or a scalar.
    A dict-like or function ``mapper`` is rejected with a ``ValueError``;
    use ``.rename`` to alter the axis *labels*.

    We *highly* recommend using keyword arguments to clarify your
    intent.

    Examples
    --------
    **Series**

    >>> s = pd.Series(["dog", "cat", "monkey"])
    >>> s.rename_axis("animal")
    animal
    0    dog
    1    cat
    2    monkey
    dtype: object

    **DataFrame**

    >>> df = pd.DataFrame({"num_legs": [4, 4, 2],
    ...                    "num_arms": [0, 0, 2]},
    ...                   ["dog", "cat", "monkey"])
    >>> df = df.rename_axis("animal")
    >>> df
            num_legs  num_arms
    animal
    dog            4         0
    cat            4         0
    monkey         2         2
    >>> df = df.rename_axis("limbs", axis="columns")
    >>> df
    limbs   num_legs  num_arms
    animal
    dog            4         0
    cat            4         0
    monkey         2         2

    **MultiIndex**

    >>> df.index = pd.MultiIndex.from_product([['mammal'],
    ...                                        ['dog', 'cat', 'monkey']],
    ...                                       names=['type', 'name'])
    >>> df.rename_axis(index={'type': 'class'})
    limbs          num_legs  num_arms
    class  name
    mammal dog            4         0
           cat            4         0
           monkey         2         2

    >>> df.rename_axis(columns=str.upper)
    LIMBS          num_legs  num_arms
    type   name
    mammal dog            4         0
           cat            4         0
           monkey         2         2
    """

    def _is_plain_name(value):
        # A scalar or a non-dict list-like is a name (or list of names),
        # not a transformation.
        return is_scalar(value) or (is_list_like(value) and not is_dict_like(value))

    axes, kwargs = self._construct_axes_from_arguments(
        (), kwargs, sentinel=lib.no_default
    )
    copy = kwargs.pop("copy", True)
    inplace = kwargs.pop("inplace", False)
    axis = kwargs.pop("axis", 0)
    if axis is not None:
        axis = self._get_axis_number(axis)

    if kwargs:
        raise TypeError(
            "rename_axis() got an unexpected keyword "
            f'argument "{next(iter(kwargs))}"'
        )

    inplace = validate_bool_kwarg(inplace, "inplace")

    if mapper is not lib.no_default:
        # Use v0.23 behavior if a scalar or list
        if not _is_plain_name(mapper):
            raise ValueError("Use `.rename` to alter labels with a mapper.")
        return self._set_axis_name(mapper, axis=axis, inplace=inplace)

    # Use new behavior. Means that index and/or columns
    # is specified
    result = self if inplace else self.copy(deep=copy)

    for axis in range(self._AXIS_LEN):
        requested = axes.get(self._AXIS_NAMES[axis])
        if requested is lib.no_default:
            continue
        if _is_plain_name(requested):
            newnames = requested
        else:
            transform = com.get_rename_function(requested)
            newnames = [transform(name) for name in self._get_axis(axis).names]
        result._set_axis_name(newnames, axis=axis, inplace=True)

    if not inplace:
        return result
def _set_axis_name(self, name, axis=0, inplace=False):
    """
    Set the name(s) of the axis.

    Parameters
    ----------
    name : str or list of str
        Name(s) to set.
    axis : {0 or 'index', 1 or 'columns'}, default 0
        The axis to set the label. The value 0 or 'index' specifies index,
        and the value 1 or 'columns' specifies columns.
    inplace : bool, default False
        If `True`, do operation inplace and return None.

        .. versionadded:: 0.21.0

    Returns
    -------
    Series, DataFrame, or None
        The same type as the caller or `None` if `inplace` is `True`.

    See Also
    --------
    DataFrame.rename : Alter the axis labels of :class:`DataFrame`.
    Series.rename : Alter the index labels or set the index name
        of :class:`Series`.
    Index.rename : Set the name of :class:`Index` or :class:`MultiIndex`.

    Examples
    --------
    >>> df = pd.DataFrame({"num_legs": [4, 4, 2]},
    ...                   ["dog", "cat", "monkey"])
    >>> df._set_axis_name("animal")
            num_legs
    animal
    dog            4
    cat            4
    monkey         2
    >>> df.index = pd.MultiIndex.from_product(
    ...                [["mammal"], ['dog', 'cat', 'monkey']])
    >>> df._set_axis_name(["type", "name"])
                   num_legs
    type   name
    mammal dog            4
           cat            4
           monkey         2
    """
    axis_number = self._get_axis_number(axis)
    relabeled = self._get_axis(axis_number).set_names(name)

    inplace = validate_bool_kwarg(inplace, "inplace")
    target = self if inplace else self.copy()
    target.set_axis(relabeled, axis=axis_number, inplace=True)

    return None if inplace else target
# ----------------------------------------------------------------------
# Comparison Methods
def _indexed_same(self, other) -> bool:
    """Return True if every axis of ``self`` ``equals`` the matching axis of ``other``."""
    for name in self._AXIS_ORDERS:
        if not self._get_axis(name).equals(other._get_axis(name)):
            return False
    return True
def equals(self, other):
    """
    Test whether two objects contain the same elements.

    This function allows two Series or DataFrames to be compared against
    each other to see if they have the same shape and elements. NaNs in
    the same location are considered equal. The column headers do not
    need to have the same type, but the elements within the columns must
    be the same dtype.

    Parameters
    ----------
    other : Series or DataFrame
        The other Series or DataFrame to be compared with the first.

    Returns
    -------
    bool
        True if all elements are the same in both objects, False
        otherwise.

    See Also
    --------
    Series.eq : Compare two Series objects of the same length
        and return a Series where each element is True if the element
        in each Series is equal, False otherwise.
    DataFrame.eq : Compare two DataFrame objects of the same shape and
        return a DataFrame where each element is True if the respective
        element in each DataFrame is equal, False otherwise.
    testing.assert_series_equal : Raises an AssertionError if left and
        right are not equal. Provides an easy interface to ignore
        inequality in dtypes, indexes and precision among others.
    testing.assert_frame_equal : Like assert_series_equal, but targets
        DataFrames.
    numpy.array_equal : Return True if two arrays have the same shape
        and elements, False otherwise.

    Notes
    -----
    This function requires that the elements have the same dtype as their
    respective elements in the other Series or DataFrame. However, the
    column labels do not need to have the same type, as long as they are
    still considered equal.

    Examples
    --------
    >>> df = pd.DataFrame({1: [10], 2: [20]})
    >>> df
        1   2
    0  10  20

    DataFrames df and exactly_equal have the same types and values for
    their elements and column labels, which will return True.

    >>> exactly_equal = pd.DataFrame({1: [10], 2: [20]})
    >>> df.equals(exactly_equal)
    True

    DataFrames df and different_column_type have the same element
    types and values, but have different types for the column labels,
    which will still return True.

    >>> different_column_type = pd.DataFrame({1.0: [10], 2.0: [20]})
    >>> df.equals(different_column_type)
    True

    DataFrames df and different_data_type have different types for the
    same values for their elements, and will return False even though
    their column labels are the same values and types.

    >>> different_data_type = pd.DataFrame({1: [10.0], 2: [20.0]})
    >>> df.equals(different_data_type)
    False
    """
    # Anything that is not an instance of our own constructor is unequal;
    # otherwise the block managers decide.
    return isinstance(other, self._constructor) and self._data.equals(other._data)
# -------------------------------------------------------------------------
# Unary Methods
def __neg__(self):
    """
    Implement unary minus (``-obj``).

    Boolean values are inverted with ``operator.inv``; numeric,
    timedelta64 and object values are negated with ``operator.neg``.
    The result is re-wrapped through ``__array_wrap__``.

    Raises
    ------
    TypeError
        If the underlying values have none of the dtypes listed above.
    """
    data = com.values_from_object(self)

    # The boolean check has to come first: it selects inversion instead of
    # arithmetic negation.
    if is_bool_dtype(data):
        return self.__array_wrap__(operator.inv(data))

    if not (
        is_numeric_dtype(data) or is_timedelta64_dtype(data) or is_object_dtype(data)
    ):
        raise TypeError(f"Unary negative expects numeric dtype, not {data.dtype}")

    return self.__array_wrap__(operator.neg(data))
def __pos__(self):
    """
    Implement unary plus (``+obj``).

    Boolean and period-like values are passed through untouched; numeric,
    timedelta64 and object values go through ``operator.pos``. The result
    is re-wrapped through ``__array_wrap__``.

    Raises
    ------
    TypeError
        If the underlying values have none of the dtypes listed above.
    """
    data = com.values_from_object(self)

    # Nothing to compute for these dtypes: wrap the values as they are.
    if is_bool_dtype(data) or is_period_arraylike(data):
        return self.__array_wrap__(data)

    if not (
        is_numeric_dtype(data) or is_timedelta64_dtype(data) or is_object_dtype(data)
    ):
        raise TypeError(f"Unary plus expects numeric dtype, not {data.dtype}")

    return self.__array_wrap__(operator.pos(data))
def __invert__(self):
    """
    Implement ``~obj`` by applying ``operator.invert`` to the internal data.

    Returns
    -------
    same type as caller
        A new object built from the inverted data and finalized from
        ``self``; ``self`` itself when the object has no elements.
    """
    if self.size:
        inverted = self._data.apply(operator.invert)
        return self._constructor(inverted).__finalize__(self)

    # inv fails with 0 len, so an object without elements is returned as-is
    return self
def __nonzero__(self):
    """
    Refuse truth-value testing of the object.

    Raises
    ------
    ValueError
        Always; the message names the concrete class and lists the
        explicit alternatives.
    """
    kind = type(self).__name__
    raise ValueError(
        f"The truth value of a {kind} is ambiguous. "
        "Use a.empty, a.bool(), a.item(), a.any() or a.all()."
    )


# ``__bool__`` is bound to the very same function object.
__bool__ = __nonzero__
def bool(self):
    """
    Return the bool of a single element PandasObject.

    The object is squeezed first. The squeezed value must be a boolean
    scalar (``bool`` or ``numpy.bool_``).

    Returns
    -------
    bool
        Same single boolean value converted to bool type.

    Raises
    ------
    ValueError
        If the squeezed value is a scalar that is not boolean. When the
        squeezed value is not a scalar at all, ``self.__nonzero__()`` is
        called, which raises ValueError in the definition shown in this
        class.
    """
    squeezed = self.squeeze()

    if isinstance(squeezed, (bool, np.bool_)):
        return bool(squeezed)

    if is_scalar(squeezed):
        raise ValueError(
            "bool cannot act on a non-boolean single element "
            f"{type(self).__name__}"
        )

    # Not reducible to one scalar: delegate to the truth-value handler.
    self.__nonzero__()
def __abs__(self: FrameOrSeries) -> FrameOrSeries:
    """Implement ``abs(obj)`` by delegating to ``self.abs()``."""
    absolute = self.abs()
    return absolute
def __round__(self: FrameOrSeries, decimals: int = 0) -> FrameOrSeries:
    """Implement ``round(obj, decimals)`` by delegating to ``self.round``."""
    rounded = self.round(decimals)
    return rounded
# -------------------------------------------------------------------------
# Label or Level Combination Helpers
#
# A collection of helper methods for DataFrame/Series operations that
# accept a combination of column/index labels and levels. All such
# operations should utilize/extend these methods when possible so that we
# have consistent precedence and validation logic throughout the library.
def _is_level_reference(self, key, axis=0):
    """
    Test whether a key is a level reference for a given axis.

    A key counts as a level reference when it is not None, is hashable,
    appears among the level names of ``axis`` and is NOT a label
    reference for that same ``axis``:

    - (axis=0): matches the name of an index level and does not match
      a column label.
    - (axis=1): matches the name of a column level and does not match
      an index label.

    Parameters
    ----------
    key : hashable
        Potential level name for the given axis.
    axis : int, default 0
        Axis that levels are associated with (0 for index, 1 for columns).

    Returns
    -------
    is_level : bool
    """
    axis = self._get_axis_number(axis)

    if key is None or not is_hashable(key):
        return False
    if key not in self.axes[axis].names:
        return False
    # A name that is also a label of the other axis is treated as a label.
    return not self._is_label_reference(key, axis=axis)
def _is_label_reference(self, key, axis=0) -> bool_t:
    """
    Test whether a key is a label reference for a given axis.

    A key counts as a label reference when it is not None, is hashable
    and is contained in any axis other than ``axis``:

    - (axis=0): matches a column label
    - (axis=1): matches an index label

    Parameters
    ----------
    key : hashable
        Potential label name.
    axis : int, default 0
        Axis perpendicular to the axis that labels are associated with
        (0 means search for column labels, 1 means search for index labels).

    Returns
    -------
    is_label : bool
    """
    axis = self._get_axis_number(axis)

    if key is None or not is_hashable(key):
        return False

    # Labels live on every axis except the one the levels belong to.
    return any(
        key in self.axes[other] for other in range(self._AXIS_LEN) if other != axis
    )
def _is_label_or_level_reference(self, key: str, axis: int = 0) -> bool_t:
    """
    Test whether a key is a label or level reference for a given axis.

    The level test runs first; the label test is only consulted when the
    level test is falsy.

    - (axis=0): matches a column label or an index level
    - (axis=1): matches an index label or a column level

    Parameters
    ----------
    key : str
        Potential label or level name.
    axis : int, default 0
        Axis that levels are associated with (0 for index, 1 for columns).

    Returns
    -------
    is_label_or_level : bool
    """
    as_level = self._is_level_reference(key, axis=axis)
    return as_level if as_level else self._is_label_reference(key, axis=axis)
def _check_label_or_level_ambiguity(self, key, axis: int = 0) -> None:
    """
    Check whether `key` is ambiguous.

    By ambiguous, we mean that it matches both a level of the input
    `axis` and a label of the other axis.

    Parameters
    ----------
    key : str or object
        Label or level name.
    axis : int, default 0
        Axis that levels are associated with (0 for index, 1 for columns).

    Raises
    ------
    ValueError
        If `key` is ambiguous.
    """
    axis = self._get_axis_number(axis)

    # Keys that cannot name anything are never ambiguous.
    if key is None or not is_hashable(key):
        return
    if key not in self.axes[axis].names:
        return

    label_axes = (other for other in range(self._AXIS_LEN) if other != axis)
    if not any(key in self.axes[other] for other in label_axes):
        return

    # Build an informative and grammatical error message
    if axis == 0:
        level_article, level_type = "an", "index"
        label_article, label_type = "a", "column"
    else:
        level_article, level_type = "a", "column"
        label_article, label_type = "an", "index"

    raise ValueError(
        f"'{key}' is both {level_article} {level_type} level and "
        f"{label_article} {label_type} label, which is ambiguous."
    )
def _get_label_or_level_values(self, key: str, axis: int = 0) -> np.ndarray:
    """
    Return a 1-D array of values associated with `key`, a label or level
    from the given `axis`.

    Retrieval logic:

    - (axis=0): Return column values if `key` matches a column label.
      Otherwise return index level values if `key` matches an index
      level.
    - (axis=1): Return row values if `key` matches an index label.
      Otherwise return column level values if `key` matches a column
      level.

    Parameters
    ----------
    key : str
        Label or level name.
    axis : int, default 0
        Axis that levels are associated with (0 for index, 1 for columns).

    Returns
    -------
    values : np.ndarray

    Raises
    ------
    KeyError
        If `key` matches neither a label nor a level.
    ValueError
        If `key` matches multiple labels (the looked-up values have more
        than one dimension). A key that is a label is also passed through
        ``_check_label_or_level_ambiguity`` before the lookup.
    """
    axis = self._get_axis_number(axis)
    other_axes = [other for other in range(self._AXIS_LEN) if other != axis]

    if self._is_label_reference(key, axis=axis):
        self._check_label_or_level_ambiguity(key, axis=axis)
        values = self.xs(key, axis=other_axes[0])._values
    elif self._is_level_reference(key, axis=axis):
        values = self.axes[axis].get_level_values(key)._values
    else:
        raise KeyError(key)

    if values.ndim <= 1:
        return values

    # More than one dimension means the label selected several entries.
    if other_axes and isinstance(self._get_axis(other_axes[0]), MultiIndex):
        multi_message = (
            "\n"
            "For a multi-index, the label must be a "
            "tuple with elements corresponding to "
            "each level."
        )
    else:
        multi_message = ""

    label_axis_name = "column" if axis == 0 else "index"
    raise ValueError(
        f"The {label_axis_name} label '{key}' " f"is not unique.{multi_message}"
    )
def _drop_labels_or_levels(self, keys, axis: int = 0):
    """
    Drop labels and/or levels for the given `axis`.

    For each key in `keys`:

    - (axis=0): If key matches a column label then drop the column.
      Otherwise if key matches an index level then drop the level.
    - (axis=1): If key matches an index label then drop the row.
      Otherwise if key matches a column level then drop the level.

    Parameters
    ----------
    keys : str or list of str
        Labels or levels to drop.
    axis : int, default 0
        Axis that levels are associated with (0 for index, 1 for columns).

    Returns
    -------
    dropped : same type as caller
        A copy of ``self`` with the requested labels/levels removed.

    Raises
    ------
    ValueError
        If any `keys` match neither a label nor a level.
    """
    axis = self._get_axis_number(axis)

    # Validate keys
    keys = com.maybe_make_list(keys)
    unknown_keys = [
        k for k in keys if not self._is_label_or_level_reference(k, axis=axis)
    ]
    if unknown_keys:
        raise ValueError(
            "The following keys are not valid labels or "
            f"levels for axis {axis}: {unknown_keys}"
        )

    # Split the (now validated) keys into levels and labels.
    levels_to_drop = []
    labels_to_drop = []
    for k in keys:
        if self._is_level_reference(k, axis=axis):
            levels_to_drop.append(k)
        else:
            labels_to_drop.append(k)

    # One copy up front, in-place operations afterwards, so exactly one
    # copy is made. ``copy``/``inplace`` options could be added later.
    dropped = self.copy()

    if axis == 0:
        if levels_to_drop:
            # Index levels are discarded via reset_index(drop=True)
            dropped.reset_index(levels_to_drop, drop=True, inplace=True)
        if labels_to_drop:
            # Column labels
            dropped.drop(labels_to_drop, axis=1, inplace=True)
        return dropped

    if levels_to_drop:
        if isinstance(dropped.columns, MultiIndex):
            # Drop the specified levels from the MultiIndex
            dropped.columns = dropped.columns.droplevel(levels_to_drop)
        else:
            # Dropping the only level of a flat Index: replace the
            # columns with a RangeIndex of the same size
            dropped.columns = RangeIndex(dropped.columns.size)

    if labels_to_drop:
        # Index labels
        dropped.drop(labels_to_drop, axis=0, inplace=True)

    return dropped
# ----------------------------------------------------------------------
# Iteration
def __hash__(self):
    """
    Refuse hashing.

    Raises
    ------
    TypeError
        Always; the message contains the repr of the concrete class name.
    """
    quoted_name = repr(type(self).__name__)
    raise TypeError(
        f"{quoted_name} objects are mutable, " "thus they cannot be hashed"
    )
def __iter__(self):
    """
    Iterate over info axis.

    Returns
    -------
    iterator
        Info axis as iterator.
    """
    info_axis = self._info_axis
    return iter(info_axis)
def keys(self):
    """
    Get the 'info axis' (see Indexing for more).

    This is index for Series, columns for DataFrame. The axis object
    itself is returned, not a copy.

    Returns
    -------
    Index
        Info axis.
    """
    info_axis = self._info_axis
    return info_axis
def items(self):
    """Iterate over (label, values) on info axis

    This is index for Series and columns for DataFrame. Each value is
    looked up lazily with ``self[label]`` as the generator is consumed.

    Returns
    -------
    Generator
    """
    yield from ((label, self[label]) for label in self._info_axis)
@Appender(items.__doc__)
def iteritems(self):
    # Alias of ``items``. No docstring is written here on purpose: the
    # decorator above is handed ``items.__doc__``.
    pairs = self.items()
    return pairs
def __len__(self) -> int:
    """Returns length of info axis"""
    info_axis = self._info_axis
    return len(info_axis)
def __contains__(self, key) -> bool_t:
    """True if the key is in the info axis"""
    info_axis = self._info_axis
    return key in info_axis
@property
def empty(self) -> bool_t:
    """
    Indicator whether DataFrame is empty.

    True if DataFrame is entirely empty (no items), meaning any of the
    axes are of length 0.

    Returns
    -------
    bool
        If DataFrame is empty, return True, if not return False.

    See Also
    --------
    Series.dropna
    DataFrame.dropna

    Notes
    -----
    If DataFrame contains only NaNs, it is still not considered empty. See
    the example below.

    Examples
    --------
    An example of an actual empty DataFrame. Notice the index is empty:

    >>> df_empty = pd.DataFrame({'A' : []})
    >>> df_empty
    Empty DataFrame
    Columns: [A]
    Index: []
    >>> df_empty.empty
    True

    If we only have NaNs in our DataFrame, it is not considered empty! We
    will need to drop the NaNs to make the DataFrame empty:

    >>> df = pd.DataFrame({'A' : [np.nan]})
    >>> df
        A
    0 NaN
    >>> df.empty
    False
    >>> df.dropna().empty
    True
    """
    # A single zero-length axis is enough to make the object empty.
    for axis_name in self._AXIS_ORDERS:
        if len(self._get_axis(axis_name)) == 0:
            return True
    return False
# ----------------------------------------------------------------------
# Array Interface
# This is also set in IndexOpsMixin
# GH#23114 Ensure ndarray.__op__(DataFrame) returns NotImplemented
# NOTE(review): presumably NumPy compares this value against ndarray's own
# priority when deciding which operand handles a mixed binary operation;
# confirm against the NumPy documentation before changing the number.
__array_priority__ = 1000
def __array__(self, dtype=None) -> np.ndarray:
    """
    Return the object's values as a NumPy array.

    Parameters
    ----------
    dtype : str or numpy.dtype, optional
        dtype requested by the caller (NumPy passes it when it invokes
        this hook). ``None`` keeps whatever dtype the values already have.

    Returns
    -------
    numpy.ndarray
        The values obtained through ``com.values_from_object(self)``,
        converted to `dtype` when one is given.
    """
    # The original body dropped ``dtype`` on the floor, so a direct call
    # such as ``obj.__array__(dtype=float)`` returned the unconverted
    # values. ``np.asarray`` performs no conversion for ``dtype=None``.
    return np.asarray(com.values_from_object(self), dtype=dtype)
def __array_wrap__(self, result, context=None):
    """
    Wrap the outcome of a NumPy operation back into this container type.

    Parameters
    ----------
    result : array-like or scalar
        Raw result; zero-dimensional arrays are unboxed first.
    context : optional
        Accepted for signature compatibility; not used by this body.

    Returns
    -------
    scalar or same type as caller
        Scalars are returned unchanged; anything else is passed to
        ``self._constructor`` together with this object's axes and
        finalized from ``self``.
    """
    result = lib.item_from_zerodim(result)

    if not is_scalar(result):
        axes = self._construct_axes_dict(self._AXIS_ORDERS, copy=False)
        return self._constructor(result, **axes).__finalize__(self)

    # e.g. we get here with np.ptp(series)
    # ptp also requires the item_from_zerodim
    return result
# ideally we would define this to avoid the getattr checks, but
# is slower
# @property
# def __array_interface__(self):
# """ provide numpy array interface method """
# values = self.values
# return dict(typestr=values.dtype.str,shape=values.shape,data=values)
# ----------------------------------------------------------------------
# Picklability
def __getstate__(self) -> Dict[str, Any]:
    """
    Build the dictionary that is pickled for this object.

    Returns
    -------
    dict
        ``_data``, ``_typ``, ``_metadata`` and ``attrs`` plus one entry
        per name listed in ``self._metadata`` (``None`` for names that
        are not set on the instance).
    """
    meta = {}
    for name in self._metadata:
        meta[name] = getattr(self, name, None)

    # ``dict(..., **meta)`` is kept deliberately: a metadata name that
    # collides with one of the fixed keys raises instead of overwriting.
    return dict(
        _data=self._data,
        _typ=self._typ,
        _metadata=self._metadata,
        attrs=self.attrs,
        **meta,
    )
def __setstate__(self, state):
    """
    Restore the object from unpickled `state`.

    Parameters
    ----------
    state : BlockManager, dict or 2-element sequence
        - BlockManager: stored directly as ``_data``.
        - dict with a non-None ``"_typ"`` entry: attributes are set one by
          one with ``object.__setattr__``.
        - dict without ``"_typ"``, or any other state of length 2: handed
          to ``self._unpickle_series_compat``.

    Notes
    -----
    ``_item_cache`` is reset to an empty dict in every case.
    """
    if isinstance(state, BlockManager):
        self._data = state
    elif isinstance(state, dict):
        if state.get("_typ") is None:
            self._unpickle_series_compat(state)
        else:
            # NOTE(review): ``__getstate__`` in this class stores the
            # attrs under "attrs", not "_attrs"; that entry is presumably
            # applied by the catch-all loop below -- confirm.
            object.__setattr__(self, "_attrs", state.get("_attrs", {}))

            # Internal names and metadata are set first to avoid
            # definitional recursion, e.g. say fill_value needing _data
            # to be defined.
            meta = set(self._internal_names + self._metadata)
            for name in list(meta):
                if name in state:
                    object.__setattr__(self, name, state[name])

            # Everything else in the state is set afterwards.
            for name, value in state.items():
                if name not in meta:
                    object.__setattr__(self, name, value)
    elif len(state) == 2:
        self._unpickle_series_compat(state)

    self._item_cache = {}
# ----------------------------------------------------------------------
# Rendering Methods
def __repr__(self) -> str:
    """
    Return ``ClassName([item,item,...])``.

    The items come from iterating over ``self``; each one is rendered
    with ``pprint_thing`` and the pieces are joined with a bare comma.
    """
    rendered_items = ",".join(pprint_thing(item) for item in self)
    return f"{type(self).__name__}([{rendered_items}])"
def _repr_latex_(self):
    """
    Returns a LaTeX representation for a particular object.

    Mainly for use with nbconvert (jupyter notebook conversion to pdf).
    The output of ``self.to_latex()`` is returned only when the
    ``display.latex.repr`` option is truthy; otherwise the result is None.
    """
    return self.to_latex() if config.get_option("display.latex.repr") else None
def _repr_data_resource_(self):
    """
    Not a real Jupyter special repr method, but we use the same
    naming convention.

    Returns
    -------
    collections.OrderedDict or None
        The ``orient="table"`` JSON of the first ``display.max_rows``
        rows, parsed back into ordered dictionaries, when the
        ``display.html.table_schema`` option is truthy; None otherwise.
    """
    if not config.get_option("display.html.table_schema"):
        return None

    preview = self.head(config.get_option("display.max_rows"))
    return json.loads(
        preview.to_json(orient="table"), object_pairs_hook=collections.OrderedDict
    )
# ----------------------------------------------------------------------
# I/O Methods
# Docstring template for ``to_markdown``; ``%(klass)s`` is a placeholder
# meant to be filled in with %-style substitution by whoever consumes it.
_shared_docs["to_markdown"] = """
Print %(klass)s in Markdown-friendly format.
.. versionadded:: 1.0.0
Parameters
----------
buf : writable buffer, defaults to sys.stdout
Where to send the output. By default, the output is printed to
sys.stdout. Pass a writable buffer if you need to further process
the output.
mode : str, optional
Mode in which file is opened.
**kwargs
These parameters will be passed to `tabulate`.
Returns
-------
str
%(klass)s in Markdown-friendly format.
"""
# Docstring template for ``to_excel``. ``%(klass)s`` is substituted with
# %-formatting where the template is used, which is why the literal
# percent sign in the ``float_format`` example is written as ``%%``.
# Text fixes relative to the previous revision: "not possible write" ->
# "not possible to write", and the append-mode example now uses ``df1``
# (the frame the examples actually define) instead of an undefined ``df``.
_shared_docs["to_excel"] = """
Write %(klass)s to an Excel sheet.
To write a single %(klass)s to an Excel .xlsx file it is only necessary to
specify a target file name. To write to multiple sheets it is necessary to
create an `ExcelWriter` object with a target file name, and specify a sheet
in the file to write to.
Multiple sheets may be written to by specifying unique `sheet_name`.
With all data written to the file it is necessary to save the changes.
Note that creating an `ExcelWriter` object with a file name that already
exists will result in the contents of the existing file being erased.
Parameters
----------
excel_writer : str or ExcelWriter object
File path or existing ExcelWriter.
sheet_name : str, default 'Sheet1'
Name of sheet which will contain DataFrame.
na_rep : str, default ''
Missing data representation.
float_format : str, optional
Format string for floating point numbers. For example
``float_format="%%.2f"`` will format 0.1234 to 0.12.
columns : sequence or list of str, optional
Columns to write.
header : bool or list of str, default True
Write out the column names. If a list of string is given it is
assumed to be aliases for the column names.
index : bool, default True
Write row names (index).
index_label : str or sequence, optional
Column label for index column(s) if desired. If not specified, and
`header` and `index` are True, then the index names are used. A
sequence should be given if the DataFrame uses MultiIndex.
startrow : int, default 0
Upper left cell row to dump data frame.
startcol : int, default 0
Upper left cell column to dump data frame.
engine : str, optional
Write engine to use, 'openpyxl' or 'xlsxwriter'. You can also set this
via the options ``io.excel.xlsx.writer``, ``io.excel.xls.writer``, and
``io.excel.xlsm.writer``.
merge_cells : bool, default True
Write MultiIndex and Hierarchical Rows as merged cells.
encoding : str, optional
Encoding of the resulting excel file. Only necessary for xlwt,
other writers support unicode natively.
inf_rep : str, default 'inf'
Representation for infinity (there is no native representation for
infinity in Excel).
verbose : bool, default True
Display more information in the error logs.
freeze_panes : tuple of int (length 2), optional
Specifies the one-based bottommost row and rightmost column that
is to be frozen.
See Also
--------
to_csv : Write DataFrame to a comma-separated values (csv) file.
ExcelWriter : Class for writing DataFrame objects into excel sheets.
read_excel : Read an Excel file into a pandas DataFrame.
read_csv : Read a comma-separated values (csv) file into DataFrame.
Notes
-----
For compatibility with :meth:`~DataFrame.to_csv`,
to_excel serializes lists and dicts to strings before writing.
Once a workbook has been saved it is not possible to write further data
without rewriting the whole workbook.
Examples
--------
Create, write to and save a workbook:
>>> df1 = pd.DataFrame([['a', 'b'], ['c', 'd']],
... index=['row 1', 'row 2'],
... columns=['col 1', 'col 2'])
>>> df1.to_excel("output.xlsx") # doctest: +SKIP
To specify the sheet name:
>>> df1.to_excel("output.xlsx",
... sheet_name='Sheet_name_1') # doctest: +SKIP
If you wish to write to more than one sheet in the workbook, it is
necessary to specify an ExcelWriter object:
>>> df2 = df1.copy()
>>> with pd.ExcelWriter('output.xlsx') as writer: # doctest: +SKIP
... df1.to_excel(writer, sheet_name='Sheet_name_1')
... df2.to_excel(writer, sheet_name='Sheet_name_2')
ExcelWriter can also be used to append to an existing Excel file:
>>> with pd.ExcelWriter('output.xlsx',
... mode='a') as writer: # doctest: +SKIP
... df1.to_excel(writer, sheet_name='Sheet_name_3')
To set the library that is used to write the Excel file,
you can pass the `engine` keyword (the default engine is
automatically chosen depending on the file extension):
>>> df1.to_excel('output1.xlsx', engine='xlsxwriter') # doctest: +SKIP
"""
@Appender(_shared_docs["to_excel"] % dict(klass="object"))
def to_excel(
    self,
    excel_writer,
    sheet_name="Sheet1",
    na_rep="",
    float_format=None,
    columns=None,
    header=True,
    index=True,
    index_label=None,
    startrow=0,
    startcol=0,
    engine=None,
    merge_cells=True,
    encoding=None,
    inf_rep="inf",
    verbose=True,
    freeze_panes=None,
) -> None:
    # No docstring here on purpose: the decorator above is handed the
    # shared "to_excel" template.

    # The formatter is fed a DataFrame; any other caller is converted
    # with ``to_frame()`` first.
    frame = self if isinstance(self, ABCDataFrame) else self.to_frame()

    # Imported at call time rather than at module import.
    from pandas.io.formats.excel import ExcelFormatter

    # NOTE(review): ``encoding`` and ``verbose`` are accepted by the
    # signature but are not forwarded to the formatter or to ``write``.
    layout_options = dict(
        na_rep=na_rep,
        cols=columns,
        header=header,
        float_format=float_format,
        index=index,
        index_label=index_label,
        merge_cells=merge_cells,
        inf_rep=inf_rep,
    )
    placement_options = dict(
        sheet_name=sheet_name,
        startrow=startrow,
        startcol=startcol,
        freeze_panes=freeze_panes,
        engine=engine,
    )

    formatter = ExcelFormatter(frame, **layout_options)
    formatter.write(excel_writer, **placement_options)
def to_json(
    self,
    path_or_buf: Optional[FilePathOrBuffer] = None,
    orient: Optional[str] = None,
    date_format: Optional[str] = None,
    double_precision: int = 10,
    force_ascii: bool_t = True,
    date_unit: str = "ms",
    default_handler: Optional[Callable[[Any], JSONSerializable]] = None,
    lines: bool_t = False,
    compression: Optional[str] = "infer",
    index: bool_t = True,
    indent: Optional[int] = None,
) -> Optional[str]:
    """
    Convert the object to a JSON string.

    Note NaN's and None will be converted to null and datetime objects
    will be converted to UNIX timestamps.

    Parameters
    ----------
    path_or_buf : str or file handle, optional
        File path or object. If not specified, the result is returned as
        a string.
    orient : str
        Indication of expected JSON string format.

        * Series:

            - default is 'index'
            - allowed values are: {'split','records','index','table'}.

        * DataFrame:

            - default is 'columns'
            - allowed values are: {'split', 'records', 'index', 'columns',
              'values', 'table'}.

        * The format of the JSON string:

            - 'split' : dict like {'index' -> [index], 'columns' -> [columns],
              'data' -> [values]}
            - 'records' : list like [{column -> value}, ... , {column -> value}]
            - 'index' : dict like {index -> {column -> value}}
            - 'columns' : dict like {column -> {index -> value}}
            - 'values' : just the values array
            - 'table' : dict like {'schema': {schema}, 'data': {data}}

            Describing the data, where data component is like ``orient='records'``.

        .. versionchanged:: 0.20.0

    date_format : {None, 'epoch', 'iso'}
        Type of date conversion. 'epoch' = epoch milliseconds,
        'iso' = ISO8601. The default depends on the `orient`. For
        ``orient='table'``, the default is 'iso'. For all other orients,
        the default is 'epoch'.
    double_precision : int, default 10
        The number of decimal places to use when encoding
        floating point values.
    force_ascii : bool, default True
        Force encoded string to be ASCII.
    date_unit : str, default 'ms' (milliseconds)
        The time unit to encode to, governs timestamp and ISO8601
        precision. One of 's', 'ms', 'us', 'ns' for second, millisecond,
        microsecond, and nanosecond respectively.
    default_handler : callable, default None
        Handler to call if object cannot otherwise be converted to a
        suitable format for JSON. Should receive a single argument which is
        the object to convert and return a serialisable object.
    lines : bool, default False
        If 'orient' is 'records' write out line delimited json format. Will
        throw ValueError if incorrect 'orient' since others are not list
        like.
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}
        A string representing the compression to use in the output file,
        only used when the first argument is a filename. By default, the
        compression is inferred from the filename.

        .. versionadded:: 0.21.0
        .. versionchanged:: 0.24.0
           'infer' option added and set to default
    index : bool, default True
        Whether to include the index values in the JSON string. Not
        including the index (``index=False``) is only supported when
        orient is 'split' or 'table'.

        .. versionadded:: 0.23.0

    indent : int, optional
        Length of whitespace used to indent each record.

        .. versionadded:: 1.0.0

    Returns
    -------
    None or str
        If path_or_buf is None, returns the resulting json format as a
        string. Otherwise returns None.

    See Also
    --------
    read_json

    Notes
    -----
    The behavior of ``indent=0`` varies from the stdlib, which does not
    indent the output but does insert newlines. Currently, ``indent=0``
    and the default ``indent=None`` are equivalent in pandas, though this
    may change in a future release.

    Examples
    --------
    >>> df = pd.DataFrame([['a', 'b'], ['c', 'd']],
    ...                   index=['row 1', 'row 2'],
    ...                   columns=['col 1', 'col 2'])
    >>> df.to_json(orient='split')
    '{"columns":["col 1","col 2"],
      "index":["row 1","row 2"],
      "data":[["a","b"],["c","d"]]}'

    Encoding/decoding a Dataframe using ``'records'`` formatted JSON.
    Note that index labels are not preserved with this encoding.

    >>> df.to_json(orient='records')
    '[{"col 1":"a","col 2":"b"},{"col 1":"c","col 2":"d"}]'

    Encoding/decoding a Dataframe using ``'index'`` formatted JSON:

    >>> df.to_json(orient='index')
    '{"row 1":{"col 1":"a","col 2":"b"},"row 2":{"col 1":"c","col 2":"d"}}'

    Encoding/decoding a Dataframe using ``'columns'`` formatted JSON:

    >>> df.to_json(orient='columns')
    '{"col 1":{"row 1":"a","row 2":"c"},"col 2":{"row 1":"b","row 2":"d"}}'

    Encoding/decoding a Dataframe using ``'values'`` formatted JSON:

    >>> df.to_json(orient='values')
    '[["a","b"],["c","d"]]'

    Encoding with Table Schema

    >>> df.to_json(orient='table')
    '{"schema": {"fields": [{"name": "index", "type": "string"},
                            {"name": "col 1", "type": "string"},
                            {"name": "col 2", "type": "string"}],
                 "primaryKey": "index",
                 "pandas_version": "0.20.0"},
      "data": [{"index": "row 1", "col 1": "a", "col 2": "b"},
               {"index": "row 2", "col 1": "c", "col 2": "d"}]}'
    """
    # Local import: inside this function ``json`` means pandas' JSON I/O
    # package, not the standard-library module.
    from pandas.io import json

    # Resolve the orient-dependent default for ``date_format``.
    if date_format is None:
        date_format = "iso" if orient == "table" else "epoch"

    # Validate ``indent`` before normalising None (and 0) to 0.
    config.is_nonnegative_int(indent)
    if not indent:
        indent = 0

    return json.to_json(
        path_or_buf=path_or_buf,
        obj=self,
        orient=orient,
        date_format=date_format,
        double_precision=double_precision,
        force_ascii=force_ascii,
        date_unit=date_unit,
        default_handler=default_handler,
        lines=lines,
        compression=compression,
        index=index,
        indent=indent,
    )
def to_hdf(
    self,
    path_or_buf,
    key: str,
    mode: str = "a",
    complevel: Optional[int] = None,
    complib: Optional[str] = None,
    append: bool_t = False,
    format: Optional[str] = None,
    index: bool_t = True,
    min_itemsize: Optional[Union[int, Dict[str, int]]] = None,
    nan_rep=None,
    dropna: Optional[bool_t] = None,
    data_columns: Optional[List[str]] = None,
    errors: str = "strict",
    encoding: str = "UTF-8",
) -> None:
    """
    Write the contained data to an HDF5 file using HDFStore.

    Hierarchical Data Format (HDF) is self-describing, allowing an
    application to interpret the structure and contents of a file with
    no outside information. One HDF file can hold a mix of related objects
    which can be accessed as a group or as individual objects.

    In order to add another DataFrame or Series to an existing HDF file
    please use append mode and a different key.

    For more information see the :ref:`user guide <io.hdf5>`.

    Parameters
    ----------
    path_or_buf : str or pandas.HDFStore
        File path or HDFStore object.
    key : str
        Identifier for the group in the store.
    mode : {'a', 'w', 'r+'}, default 'a'
        Mode to open file:

        - 'w': write, a new file is created (an existing file with
          the same name would be deleted).
        - 'a': append, an existing file is opened for reading and
          writing, and if the file does not exist it is created.
        - 'r+': similar to 'a', but the file must already exist.
    complevel : {0-9}, optional
        Specifies a compression level for data.
        A value of 0 disables compression.
    complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib'
        Specifies the compression library to be used.
        As of v0.20.2 these additional compressors for Blosc are supported
        (default if no compressor specified: 'blosc:blosclz'):
        {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy',
        'blosc:zlib', 'blosc:zstd'}.
        Specifying a compression library which is not available issues
        a ValueError.
    append : bool, default False
        For Table formats, append the input data to the existing.
    format : {'fixed', 'table', None}, default 'fixed'
        Possible values:

        - 'fixed': Fixed format. Fast writing/reading. Not-appendable,
          nor searchable.
        - 'table': Table format. Write as a PyTables Table structure
          which may perform worse but allow more flexible operations
          like searching / selecting subsets of the data.
        - If None, pd.get_option('io.hdf.default_format') is checked,
          followed by fallback to "fixed"
    index : bool, default True
        Forwarded unchanged to ``pandas.io.pytables.to_hdf``.
    min_itemsize : dict or int, optional
        Map column names to minimum string sizes for columns.
    nan_rep : Any, optional
        How to represent null values as str.
        Not allowed with append=True.
    dropna : bool, optional
        Forwarded unchanged to ``pandas.io.pytables.to_hdf``.
    data_columns : list of columns or True, optional
        List of columns to create as indexed data columns for on-disk
        queries, or True to use all columns. By default only the axes
        of the object are indexed. See :ref:`io.hdf5-query-data-columns`.
        Applicable only to format='table'.
    errors : str, default 'strict'
        Specifies how encoding and decoding errors are to be handled.
        See the errors argument for :func:`open` for a full list
        of options.
    encoding : str, default "UTF-8"

    See Also
    --------
    read_hdf : Read from HDF file.
    DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
    DataFrame.to_sql : Write to a sql table.
    DataFrame.to_feather : Write out feather-format for DataFrames.
    DataFrame.to_csv : Write out to a csv file.

    Examples
    --------
    >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},
    ...                   index=['a', 'b', 'c'])
    >>> df.to_hdf('data.h5', key='df', mode='w')

    We can add another object to the same file:

    >>> s = pd.Series([1, 2, 3, 4])
    >>> s.to_hdf('data.h5', key='s')

    Reading from HDF file:

    >>> pd.read_hdf('data.h5', 'df')
    A  B
    a  1  4
    b  2  5
    c  3  6
    >>> pd.read_hdf('data.h5', 's')
    0    1
    1    2
    2    3
    3    4
    dtype: int64

    Deleting file with data:

    >>> import os
    >>> os.remove('data.h5')
    """
    # Imported at call time rather than at module import.
    from pandas.io import pytables

    # Every option is handed through unchanged; this method adds no logic
    # of its own.
    store_options = dict(
        mode=mode,
        complevel=complevel,
        complib=complib,
        append=append,
        format=format,
        index=index,
        min_itemsize=min_itemsize,
        nan_rep=nan_rep,
        dropna=dropna,
        data_columns=data_columns,
        errors=errors,
        encoding=encoding,
    )
    pytables.to_hdf(path_or_buf, key, self, **store_options)
def to_sql(
    self,
    name: str,
    con,
    schema=None,
    if_exists: str = "fail",
    index: bool_t = True,
    index_label=None,
    chunksize=None,
    dtype=None,
    method=None,
) -> None:
    """
    Write records stored in a DataFrame to a SQL database.

    Databases supported by SQLAlchemy [1]_ are supported. Tables can be
    newly created, appended to, or overwritten.

    Parameters
    ----------
    name : str
        Name of SQL table.
    con : sqlalchemy.engine.Engine or sqlite3.Connection
        Using SQLAlchemy makes it possible to use any DB supported by that
        library. Legacy support is provided for sqlite3.Connection objects.
        The user is responsible for engine disposal and connection closure
        for the SQLAlchemy connectable. See
        `here <https://docs.sqlalchemy.org/en/13/core/connections.html>`_
    schema : str, optional
        Specify the schema (if database flavor supports this). If None, use
        default schema.
    if_exists : {'fail', 'replace', 'append'}, default 'fail'
        How to behave if the table already exists.

        * fail: Raise a ValueError.
        * replace: Drop the table before inserting new values.
        * append: Insert new values to the existing table.

    index : bool, default True
        Write DataFrame index as a column. Uses `index_label` as the column
        name in the table.
    index_label : str or sequence, default None
        Column label for index column(s). If None is given (default) and
        `index` is True, then the index names are used.
        A sequence should be given if the DataFrame uses MultiIndex.
    chunksize : int, optional
        Specify the number of rows in each batch to be written at a time.
        By default, all rows will be written at once.
    dtype : dict or scalar, optional
        Specifying the datatype for columns. If a dictionary is used, the
        keys should be the column names and the values should be the
        SQLAlchemy types or strings for the sqlite3 legacy mode. If a
        scalar is provided, it will be applied to all columns.
    method : {None, 'multi', callable}, optional
        Controls the SQL insertion clause used:

        * None : Uses standard SQL ``INSERT`` clause (one per row).
        * 'multi': Pass multiple values in a single ``INSERT`` clause.
        * callable with signature ``(pd_table, conn, keys, data_iter)``.

        Details and a sample callable implementation can be found in the
        section :ref:`insert method <io.sql.method>`.

        .. versionadded:: 0.24.0

    Raises
    ------
    ValueError
        When the table already exists and `if_exists` is 'fail' (the
        default).

    See Also
    --------
    read_sql : Read a DataFrame from a table.

    Notes
    -----
    Timezone aware datetime columns will be written as
    ``Timestamp with timezone`` type with SQLAlchemy if supported by the
    database. Otherwise, the datetimes will be stored as timezone unaware
    timestamps local to the original timezone.

    .. versionadded:: 0.24.0

    References
    ----------
    .. [1] http://docs.sqlalchemy.org
    .. [2] https://www.python.org/dev/peps/pep-0249/

    Examples
    --------
    Create an in-memory SQLite database.

    >>> from sqlalchemy import create_engine
    >>> engine = create_engine('sqlite://', echo=False)

    Create a table from scratch with 3 rows.

    >>> df = pd.DataFrame({'name' : ['User 1', 'User 2', 'User 3']})
    >>> df
         name
    0  User 1
    1  User 2
    2  User 3

    >>> df.to_sql('users', con=engine)
    >>> engine.execute("SELECT * FROM users").fetchall()
    [(0, 'User 1'), (1, 'User 2'), (2, 'User 3')]

    >>> df1 = pd.DataFrame({'name' : ['User 4', 'User 5']})
    >>> df1.to_sql('users', con=engine, if_exists='append')
    >>> engine.execute("SELECT * FROM users").fetchall()
    [(0, 'User 1'), (1, 'User 2'), (2, 'User 3'),
     (0, 'User 4'), (1, 'User 5')]

    Overwrite the table with just ``df1``.

    >>> df1.to_sql('users', con=engine, if_exists='replace',
    ...            index_label='id')
    >>> engine.execute("SELECT * FROM users").fetchall()
    [(0, 'User 4'), (1, 'User 5')]

    Specify the dtype (especially useful for integers with missing values).
    Notice that while pandas is forced to store the data as floating point,
    the database supports nullable integers. When fetching the data with
    Python, we get back integer scalars.

    >>> df = pd.DataFrame({"A": [1, None, 2]})
    >>> df
         A
    0  1.0
    1  NaN
    2  2.0

    >>> from sqlalchemy.types import Integer
    >>> df.to_sql('integers', con=engine, index=False,
    ...           dtype={"A": Integer()})

    >>> engine.execute("SELECT * FROM integers").fetchall()
    [(1,), (None,), (2,)]
    """
    # Imported at call time rather than at module import.
    from pandas.io import sql

    # Every option is handed through unchanged to the SQL I/O layer.
    write_options = dict(
        schema=schema,
        if_exists=if_exists,
        index=index,
        index_label=index_label,
        chunksize=chunksize,
        dtype=dtype,
        method=method,
    )
    sql.to_sql(self, name, con, **write_options)
def to_pickle(
    self,
    path,
    compression: Optional[str] = "infer",
    protocol: int = pickle.HIGHEST_PROTOCOL,
) -> None:
    """
    Serialize this object with pickle and write it to a file.

    Parameters
    ----------
    path : str
        Location of the file the pickled object is written to.
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
        Name of the compression applied to the output file. With the
        default, the compression is inferred from the extension of
        ``path``.
    protocol : int
        Protocol number handed to the pickler; HIGHEST_PROTOCOL unless
        given (see [1]_ paragraph 12.1.2). The possible values are
        0, 1, 2, 3, 4. Passing a negative number is the same as passing
        HIGHEST_PROTOCOL.

        .. [1] https://docs.python.org/3/library/pickle.html.
        .. versionadded:: 0.21.0.

    See Also
    --------
    read_pickle : Load pickled pandas object (or any object) from file.
    DataFrame.to_hdf : Write DataFrame to an HDF5 file.
    DataFrame.to_sql : Write DataFrame to a SQL database.
    DataFrame.to_parquet : Write a DataFrame to the binary parquet format.

    Examples
    --------
    >>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)})
    >>> original_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9
    >>> original_df.to_pickle("./dummy.pkl")

    >>> unpickled_df = pd.read_pickle("./dummy.pkl")
    >>> unpickled_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9

    >>> import os
    >>> os.remove("./dummy.pkl")
    """
    # Imported lazily and under an alias so the stdlib ``pickle`` module
    # used for the default ``protocol`` is not shadowed.
    from pandas.io import pickle as pickle_io

    pickle_io.to_pickle(self, path, compression=compression, protocol=protocol)
def to_clipboard(
    self, excel: bool_t = True, sep: Optional[str] = None, **kwargs
) -> None:
    r"""
    Put a text representation of the object on the system clipboard.

    The copied text can then be pasted elsewhere, for example into Excel.

    Parameters
    ----------
    excel : bool, default True
        Produce output in a csv format for easy pasting into excel.

        - True, use the provided separator for csv pasting.
        - False, write a string representation of the object to the clipboard.

    sep : str, default ``'\t'``
        Field delimiter.
    **kwargs
        These parameters will be passed to DataFrame.to_csv.

    See Also
    --------
    DataFrame.to_csv : Write a DataFrame to a comma-separated values
        (csv) file.
    read_clipboard : Read text from clipboard and pass to read_table.

    Notes
    -----
    Requirements for your platform.

      - Linux : `xclip`, or `xsel` (with `PyQt4` modules)
      - Windows : none
      - OS X : none

    Examples
    --------
    Copy the contents of a DataFrame to the clipboard.

    >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['A', 'B', 'C'])
    >>> df.to_clipboard(sep=',')
    ... # Wrote the following to the system clipboard:
    ... # ,A,B,C
    ... # 0,1,2,3
    ... # 1,4,5,6

    The index can be left out by passing the keyword `index` set to
    False.

    >>> df.to_clipboard(sep=',', index=False)
    ... # Wrote the following to the system clipboard:
    ... # A,B,C
    ... # 1,2,3
    ... # 4,5,6
    """
    # Imported at call time, as in the rest of the IO wrappers here.
    from pandas.io.clipboards import to_clipboard as copy_to_clipboard

    copy_to_clipboard(self, excel=excel, sep=sep, **kwargs)
def to_xarray(self):
    """
    Convert the pandas object into an xarray object.

    Returns
    -------
    xarray.DataArray or xarray.Dataset
        A Dataset when the object is a DataFrame, a DataArray when the
        object is a Series.

    See Also
    --------
    DataFrame.to_hdf : Write DataFrame to an HDF5 file.
    DataFrame.to_parquet : Write a DataFrame to the binary parquet format.

    Notes
    -----
    See the `xarray docs <http://xarray.pydata.org/en/stable/>`__

    Examples
    --------
    >>> df = pd.DataFrame([('falcon', 'bird', 389.0, 2),
    ...                    ('parrot', 'bird', 24.0, 2),
    ...                    ('lion', 'mammal', 80.5, 4),
    ...                    ('monkey', 'mammal', np.nan, 4)],
    ...                   columns=['name', 'class', 'max_speed',
    ...                            'num_legs'])
    >>> df
         name   class  max_speed  num_legs
    0  falcon    bird      389.0         2
    1  parrot    bird       24.0         2
    2    lion  mammal       80.5         4
    3  monkey  mammal        NaN         4

    >>> df.to_xarray()
    <xarray.Dataset>
    Dimensions:    (index: 4)
    Coordinates:
      * index      (index) int64 0 1 2 3
    Data variables:
        name       (index) object 'falcon' 'parrot' 'lion' 'monkey'
        class      (index) object 'bird' 'bird' 'mammal' 'mammal'
        max_speed  (index) float64 389.0 24.0 80.5 nan
        num_legs   (index) int64 2 2 4 4

    >>> df['max_speed'].to_xarray()
    <xarray.DataArray 'max_speed' (index: 4)>
    array([389. ,  24. ,  80.5,   nan])
    Coordinates:
      * index    (index) int64 0 1 2 3

    >>> dates = pd.to_datetime(['2018-01-01', '2018-01-01',
    ...                         '2018-01-02', '2018-01-02'])
    >>> df_multiindex = pd.DataFrame({'date': dates,
    ...                               'animal': ['falcon', 'parrot',
    ...                                          'falcon', 'parrot'],
    ...                               'speed': [350, 18, 361, 15]})
    >>> df_multiindex = df_multiindex.set_index(['date', 'animal'])

    >>> df_multiindex
                       speed
    date       animal
    2018-01-01 falcon    350
               parrot     18
    2018-01-02 falcon    361
               parrot     15

    >>> df_multiindex.to_xarray()
    <xarray.Dataset>
    Dimensions:  (animal: 2, date: 2)
    Coordinates:
      * date     (date) datetime64[ns] 2018-01-01 2018-01-02
      * animal   (animal) object 'falcon' 'parrot'
    Data variables:
        speed    (date, animal) int64 350 18 361 15
    """
    xr = import_optional_dependency("xarray")

    # Anything that is not one-dimensional goes through the Dataset path.
    if self.ndim != 1:
        return xr.Dataset.from_dataframe(self)
    return xr.DataArray.from_series(self)
@Substitution(returns=fmt.return_docstring)
def to_latex(
    self,
    buf=None,
    columns=None,
    col_space=None,
    header=True,
    index=True,
    na_rep="NaN",
    formatters=None,
    float_format=None,
    sparsify=None,
    index_names=True,
    bold_rows=False,
    column_format=None,
    longtable=None,
    escape=None,
    encoding=None,
    decimal=".",
    multicolumn=None,
    multicolumn_format=None,
    multirow=None,
    caption=None,
    label=None,
):
    r"""
    Render object to a LaTeX tabular, longtable, or nested table/tabular.

    Requires ``\usepackage{booktabs}``.  The result can be pasted into a
    main LaTeX document, or written to a file and pulled in with
    ``\input{table.tex}``.

    .. versionchanged:: 0.20.2
       Added to Series.

    .. versionchanged:: 1.0.0
       Added caption and label arguments.

    Parameters
    ----------
    buf : str, Path or StringIO-like, optional, default None
        Buffer to write to. If None, the output is returned as a string.
    columns : list of label, optional
        The subset of columns to write. Writes all columns by default.
    col_space : int, optional
        The minimum width of each column.
    header : bool or list of str, default True
        Write out the column names. If a list of strings is given,
        it is assumed to be aliases for the column names.
    index : bool, default True
        Write row names (index).
    na_rep : str, default 'NaN'
        Missing data representation.
    formatters : list of functions or dict of {str: function}, optional
        Formatter functions to apply to columns' elements by position or
        name. The result of each function must be a unicode string.
        List must be of length equal to the number of columns.
    float_format : one-parameter function or str, optional, default None
        Formatter for floating point numbers. For example
        ``float_format="%%.2f"`` and ``float_format="{:0.2f}".format`` will
        both result in 0.1234 being formatted as 0.12.
    sparsify : bool, optional
        Set to False for a DataFrame with a hierarchical index to print
        every multiindex key at each row. By default, the value will be
        read from the config module.
    index_names : bool, default True
        Prints the names of the indexes.
    bold_rows : bool, default False
        Make the row labels bold in the output.
    column_format : str, optional
        The columns format as specified in `LaTeX table format
        <https://en.wikibooks.org/wiki/LaTeX/Tables>`__ e.g. 'rcl' for 3
        columns. By default, 'l' will be used for all columns except
        columns of numbers, which default to 'r'.
    longtable : bool, optional
        By default, the value will be read from the pandas config
        module. Use a longtable environment instead of tabular. Requires
        adding a \usepackage{longtable} to your LaTeX preamble.
    escape : bool, optional
        By default, the value will be read from the pandas config
        module. When set to False prevents from escaping latex special
        characters in column names.
    encoding : str, optional
        A string representing the encoding to use in the output file,
        defaults to 'utf-8'.
    decimal : str, default '.'
        Character recognized as decimal separator, e.g. ',' in Europe.
    multicolumn : bool, default True
        Use \multicolumn to enhance MultiIndex columns.
        The default will be read from the config module.
    multicolumn_format : str, default 'l'
        The alignment for multicolumns, similar to `column_format`
        The default will be read from the config module.
    multirow : bool, default False
        Use \multirow to enhance MultiIndex rows. Requires adding a
        \usepackage{multirow} to your LaTeX preamble. Will print
        centered labels (instead of top-aligned) across the contained
        rows, separating groups via clines. The default will be read
        from the pandas config module.
    caption : str, optional
        The LaTeX caption to be placed inside ``\caption{}`` in the output.

        .. versionadded:: 1.0.0

    label : str, optional
        The LaTeX label to be placed inside ``\label{}`` in the output.
        This is used with ``\ref{}`` in the main ``.tex`` file.

        .. versionadded:: 1.0.0
    %(returns)s
    See Also
    --------
    DataFrame.to_string : Render a DataFrame to a console-friendly
        tabular output.
    DataFrame.to_html : Render a DataFrame as an HTML table.

    Examples
    --------
    >>> df = pd.DataFrame({'name': ['Raphael', 'Donatello'],
    ...                    'mask': ['red', 'purple'],
    ...                    'weapon': ['sai', 'bo staff']})
    >>> print(df.to_latex(index=False))  # doctest: +NORMALIZE_WHITESPACE
    \begin{tabular}{lll}
     \toprule
           name &    mask &    weapon \\
     \midrule
        Raphael &     red &       sai \\
      Donatello &  purple &  bo staff \\
    \bottomrule
    \end{tabular}
    """
    # One-dimensional input is rendered through its single-column frame.
    frame = self.to_frame() if self.ndim == 1 else self

    # Any option left as None falls back to the pandas display config.
    if longtable is None:
        longtable = config.get_option("display.latex.longtable")
    if escape is None:
        escape = config.get_option("display.latex.escape")
    if multicolumn is None:
        multicolumn = config.get_option("display.latex.multicolumn")
    if multicolumn_format is None:
        multicolumn_format = config.get_option("display.latex.multicolumn_format")
    if multirow is None:
        multirow = config.get_option("display.latex.multirow")

    # Cell-level formatting options go to the formatter ...
    latex_formatter = DataFrameFormatter(
        frame,
        columns=columns,
        col_space=col_space,
        na_rep=na_rep,
        header=header,
        index=index,
        formatters=formatters,
        float_format=float_format,
        bold_rows=bold_rows,
        sparsify=sparsify,
        index_names=index_names,
        escape=escape,
        decimal=decimal,
    )
    # ... while the LaTeX layout options go to the rendering call.
    rendered = latex_formatter.to_latex(
        buf=buf,
        column_format=column_format,
        longtable=longtable,
        encoding=encoding,
        multicolumn=multicolumn,
        multicolumn_format=multicolumn_format,
        multirow=multirow,
        caption=caption,
        label=label,
    )
    return rendered
def to_csv(
    self,
    path_or_buf: Optional[FilePathOrBuffer] = None,
    sep: str = ",",
    na_rep: str = "",
    float_format: Optional[str] = None,
    columns: Optional[Sequence[Optional[Hashable]]] = None,
    header: Union[bool_t, List[str]] = True,
    index: bool_t = True,
    index_label: Optional[Union[bool_t, str, Sequence[Optional[Hashable]]]] = None,
    mode: str = "w",
    encoding: Optional[str] = None,
    compression: Optional[Union[str, Mapping[str, str]]] = "infer",
    quoting: Optional[int] = None,
    quotechar: str = '"',
    line_terminator: Optional[str] = None,
    chunksize: Optional[int] = None,
    date_format: Optional[str] = None,
    doublequote: bool_t = True,
    escapechar: Optional[str] = None,
    decimal: Optional[str] = ".",
) -> Optional[str]:
    r"""
    Write object to a comma-separated values (csv) file.

    .. versionchanged:: 0.24.0
        The order of arguments for Series was changed.

    Parameters
    ----------
    path_or_buf : str or file handle, default None
        File path or object, if None is provided the result is returned as
        a string.  If a file object is passed it should be opened with
        `newline=''`, disabling universal newlines.

        .. versionchanged:: 0.24.0

           Was previously named "path" for Series.

    sep : str, default ','
        String of length 1. Field delimiter for the output file.
    na_rep : str, default ''
        Missing data representation.
    float_format : str, default None
        Format string for floating point numbers.
    columns : sequence, optional
        Columns to write.
    header : bool or list of str, default True
        Write out the column names. If a list of strings is given it is
        assumed to be aliases for the column names.

        .. versionchanged:: 0.24.0

           Previously defaulted to False for Series.

    index : bool, default True
        Write row names (index).
    index_label : str or sequence, or False, default None
        Column label for index column(s) if desired. If None is given, and
        `header` and `index` are True, then the index names are used. A
        sequence should be given if the object uses MultiIndex. If
        False do not print fields for index names. Use index_label=False
        for easier importing in R.
    mode : str
        Python write mode, default 'w'.
    encoding : str, optional
        A string representing the encoding to use in the output file,
        defaults to 'utf-8'.
    compression : str or dict, default 'infer'
        If str, represents compression mode. If dict, value at 'method' is
        the compression mode. Compression mode may be any of the following
        possible values: {'infer', 'gzip', 'bz2', 'zip', 'xz', None}. If
        compression mode is 'infer' and `path_or_buf` is path-like, then
        detect compression mode from the following extensions: '.gz',
        '.bz2', '.zip' or '.xz'. (otherwise no compression). If dict given
        and mode is 'zip' or inferred as 'zip', other entries passed as
        additional compression options.

        .. versionchanged:: 1.0.0

           May now be a dict with key 'method' as compression mode
           and other entries as additional compression options if
           compression mode is 'zip'.

    quoting : optional constant from csv module
        Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format`
        then floats are converted to strings and thus csv.QUOTE_NONNUMERIC
        will treat them as non-numeric.
    quotechar : str, default '\"'
        String of length 1. Character used to quote fields.
    line_terminator : str, optional
        The newline character or character sequence to use in the output
        file. Defaults to `os.linesep`, which depends on the OS in which
        this method is called (e.g. '\n' for linux, '\r\n' for Windows).

        .. versionchanged:: 0.24.0
    chunksize : int or None
        Rows to write at a time.
    date_format : str, default None
        Format string for datetime objects.
    doublequote : bool, default True
        Control quoting of `quotechar` inside a field.
    escapechar : str, default None
        String of length 1. Character used to escape `sep` and `quotechar`
        when appropriate.
    decimal : str, default '.'
        Character recognized as decimal separator. E.g. use ',' for
        European data.

    Returns
    -------
    None or str
        If path_or_buf is None, returns the resulting csv format as a
        string. Otherwise returns None.

    See Also
    --------
    read_csv : Load a CSV file into a DataFrame.
    to_excel : Write DataFrame to an Excel file.

    Examples
    --------
    >>> df = pd.DataFrame({'name': ['Raphael', 'Donatello'],
    ...                    'mask': ['red', 'purple'],
    ...                    'weapon': ['sai', 'bo staff']})
    >>> df.to_csv(index=False)
    'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n'

    Create 'out.zip' containing 'out.csv'

    >>> compression_opts = dict(method='zip',
    ...                         archive_name='out.csv')  # doctest: +SKIP
    >>> df.to_csv('out.zip', index=False,
    ...           compression=compression_opts)  # doctest: +SKIP
    """
    # The formatter is handed a DataFrame; anything else is converted
    # with to_frame() first.
    if isinstance(self, ABCDataFrame):
        frame = self
    else:
        frame = self.to_frame()

    from pandas.io.formats.csvs import CSVFormatter

    writer = CSVFormatter(
        frame,
        path_or_buf,
        sep=sep,
        na_rep=na_rep,
        float_format=float_format,
        cols=columns,
        header=header,
        index=index,
        index_label=index_label,
        mode=mode,
        encoding=encoding,
        compression=compression,
        quoting=quoting,
        quotechar=quotechar,
        line_terminator=line_terminator,
        chunksize=chunksize,
        date_format=date_format,
        doublequote=doublequote,
        escapechar=escapechar,
        decimal=decimal,
    )
    writer.save()

    # A caller-supplied target means there is nothing to hand back.
    if path_or_buf is not None:
        return None

    # No target was given: return the text held by the formatter's buffer.
    return writer.path_or_buf.getvalue()
# ----------------------------------------------------------------------
# Fancy Indexing
@classmethod
def _create_indexer(cls, name: str, indexer) -> None:
    """
    Install ``indexer`` on the class as a property called ``name``.

    Nothing happens when the class already has a non-None attribute of
    that name. The property getter is ``indexer`` with ``name`` bound as
    its first argument, and the property reuses ``indexer``'s docstring.

    Kept for compatibility with geopandas. To be removed in the future. See GH27258
    """
    if getattr(cls, name, None) is not None:
        # Already present on the class; leave it untouched.
        return

    bound_getter = functools.partial(indexer, name)
    setattr(cls, name, property(bound_getter, doc=indexer.__doc__))
# ----------------------------------------------------------------------
# Lookup Caching
def _set_as_cached(self, item, cacher) -> None:
    """
    Record ``cacher`` as the cacher of this object.

    ``_cacher`` is set to the pair ``(item, weak reference to cacher)``.
    """
    cacher_ref = weakref.ref(cacher)
    self._cacher = (item, cacher_ref)
def _reset_cacher(self) -> None:
    """Drop the ``_cacher`` attribute if the object has one."""
    if not hasattr(self, "_cacher"):
        return
    del self._cacher
def _maybe_cache_changed(self, item, value) -> None:
    """
    Callback used when a cached object reports that it may have changed.

    Stores ``value`` under ``item`` via ``self._data.set``.
    """
    manager = self._data
    manager.set(item, value)
@property
def _is_cached(self) -> bool_t:
    """Return True when a non-None ``_cacher`` is set on this object."""
    cacher = getattr(self, "_cacher", None)
    return cacher is not None
def _get_cacher(self):
    """
    Return the object referenced by ``_cacher``, or None.

    None is returned when no ``_cacher`` is set; otherwise the result of
    calling the stored reference (second element of ``_cacher``).
    """
    entry = getattr(self, "_cacher", None)
    if entry is None:
        return None
    # entry is (item, reference); calling the reference yields the cacher.
    return entry[1]()
def _maybe_update_cacher(
    self, clear: bool_t = False, verify_is_copy: bool_t = True
) -> None:
    """
    Notify the parent cacher, if any, and optionally clear our own cache.

    Parameters
    ----------
    clear : bool, default False
        When True, call ``_clear_item_cache`` at the end.
    verify_is_copy : bool, default True
        When True, run ``_check_setitem_copy``.
    """
    cache_entry = getattr(self, "_cacher", None)
    if cache_entry is not None:
        parent = cache_entry[1]()

        if parent is not None:
            # Note: we need to call parent._maybe_cache_changed even in
            #  the case where it will raise.  (Uh, not clear why)
            try:
                parent._maybe_cache_changed(cache_entry[0], self)
            except AssertionError:
                # parent._data.setitem can raise AssertionError because
                #  of a shape mismatch; that is deliberately ignored.
                pass
        else:
            # The weak reference is dead, so we are a copy: forget it.
            del self._cacher

    # Kept as a direct call: the stacklevel is counted from this frame.
    if verify_is_copy:
        self._check_setitem_copy(stacklevel=5, t="referant")

    if clear:
        self._clear_item_cache()
def _clear_item_cache(self) -> None:
    """Empty ``_item_cache`` in place."""
    item_cache = self._item_cache
    item_cache.clear()
# ----------------------------------------------------------------------
# Indexing Methods
def take(
    self: FrameOrSeries, indices, axis=0, is_copy: Optional[bool_t] = None, **kwargs
) -> FrameOrSeries:
    """
    Return the elements in the given *positional* indices along an axis.

    Selection is by position in the object, not by the labels stored in
    its index attribute.

    Parameters
    ----------
    indices : array-like
        An array of ints indicating which positions to take.
    axis : {0 or 'index', 1 or 'columns', None}, default 0
        The axis on which to select elements. ``0`` means that we are
        selecting rows, ``1`` means that we are selecting columns.
    is_copy : bool
        Before pandas 1.0, ``is_copy=False`` can be specified to ensure
        that the return value is an actual copy. Starting with pandas 1.0,
        ``take`` always returns a copy, and the keyword is therefore
        deprecated.

        .. deprecated:: 1.0.0
    **kwargs
        For compatibility with :meth:`numpy.take`. Has no effect on the
        output.

    Returns
    -------
    taken : same type as caller
        An array-like containing the elements taken from the object.

    See Also
    --------
    DataFrame.loc : Select a subset of a DataFrame by labels.
    DataFrame.iloc : Select a subset of a DataFrame by positions.
    numpy.take : Take elements from an array along an axis.

    Examples
    --------
    >>> df = pd.DataFrame([('falcon', 'bird', 389.0),
    ...                    ('parrot', 'bird', 24.0),
    ...                    ('lion', 'mammal', 80.5),
    ...                    ('monkey', 'mammal', np.nan)],
    ...                   columns=['name', 'class', 'max_speed'],
    ...                   index=[0, 2, 3, 1])
    >>> df
         name   class  max_speed
    0  falcon    bird      389.0
    2  parrot    bird       24.0
    3    lion  mammal       80.5
    1  monkey  mammal        NaN

    Take elements at positions 0 and 3 along the axis 0 (default).

    Note how the actual indices selected (0 and 1) do not correspond to
    our selected indices 0 and 3. That's because we are selecting the 0th
    and 3rd rows, not rows whose indices equal 0 and 3.

    >>> df.take([0, 3])
         name   class  max_speed
    0  falcon    bird      389.0
    1  monkey  mammal        NaN

    Take elements at indices 1 and 2 along the axis 1 (column selection).

    >>> df.take([1, 2], axis=1)
        class  max_speed
    0    bird      389.0
    2    bird       24.0
    3  mammal       80.5
    1  mammal        NaN

    Negative integers count positions from the end of the object, just
    like with Python lists.

    >>> df.take([-1, -2])
         name   class  max_speed
    1  monkey  mammal        NaN
    3    lion  mammal       80.5
    """
    if is_copy is not None:
        # Warn from this frame so stacklevel=2 points at the caller.
        deprecation_msg = (
            "is_copy is deprecated and will be removed in a future version. "
            "'take' always returns a copy, so there is no need to specify this."
        )
        warnings.warn(deprecation_msg, FutureWarning, stacklevel=2)

    # Validate the numpy-compat keyword arguments.
    nv.validate_take((), kwargs)

    self._consolidate_inplace()

    manager_axis = self._get_block_manager_axis(axis)
    taken_data = self._data.take(indices, axis=manager_axis, verify=True)
    return self._constructor(taken_data).__finalize__(self)
def _take_with_is_copy(
    self: FrameOrSeries, indices, axis=0, **kwargs
) -> FrameOrSeries:
    """
    Internal variant of `take` that also maintains the `_is_copy`
    bookkeeping (used in indexing for the SettingWithCopyWarning).

    The result is marked as a copy of ``self`` only when its labels along
    ``axis`` differ from those of ``self``.

    See the docstring of `take` for full explanation of the parameters.
    """
    taken = self.take(indices=indices, axis=axis, **kwargs)

    # Maybe set copy if we didn't actually change the index.
    labels_unchanged = taken._get_axis(axis).equals(self._get_axis(axis))
    if not labels_unchanged:
        taken._set_is_copy(self)
    return taken
def xs(self, key, axis=0, level=None, drop_level: bool_t = True):
"""
Return cross-section from the Series/DataFrame.
This method takes a `key` argument to select data at a particular
level of a MultiIndex.
Parameters
----------
key : label or tuple of label
Label contained in the index, or partially in a MultiIndex.
axis : {0 or 'index', 1 or 'columns'}, default 0
Axis to retrieve cross-section on.
level : object, defaults to first n levels (n=1 or len(key))
In case of a key partially contained in a MultiIndex, indicate
which levels are used. Levels can be referred by label or position.
drop_level : bool, default True
If False, returns object with same levels as self.
Returns
-------
Series or DataFrame
Cross-section from the original Series or DataFrame
corresponding to the selected index levels.
See Also
--------
DataFrame.loc : Access a group of rows and columns
by label(s) or a boolean array.
DataFrame.iloc : Purely integer-location based indexing
for selection by position.
Notes
-----
`xs` can not be used to set values.
MultiIndex Slicers is a generic way to get/set values on
any level or levels.
It is a superset of `xs` functionality, see
:ref:`MultiIndex Slicers <advanced.mi_slicers>`.
Examples
--------
>>> d = {'num_legs': [4, 4, 2, 2],
... 'num_wings': [0, 0, 2, 2],
... 'class': ['mammal', 'mammal', 'mammal', 'bird'],
... 'animal': ['cat', 'dog', 'bat', 'penguin'],
... 'locomotion': ['walks', 'walks', 'flies', 'walks']}
>>> df = pd.DataFrame(data=d)
>>> df = df.set_index(['class', 'animal', 'locomotion'])
>>> df
num_legs num_wings
class animal locomotion
mammal cat walks 4 0
dog walks 4 0
bat flies 2 2
bird penguin walks 2 2
Get values at specified index
>>> df.xs('mammal')
num_legs num_wings
animal locomotion
cat walks 4 0
dog walks 4 0
bat flies 2 2
Get values at several indexes
>>> df.xs(('mammal', 'dog'))
num_legs num_wings
locomotion
walks 4 0
Get values at specified index and level
>>> df.xs('cat', level=1)
num_legs num_wings
class locomotion
mammal walks 4 0
Get values at several indexes and levels
>>> df.xs(('bird', 'walks'),
... level=[0, 'locomotion'])
num_legs num_wings
animal
penguin 2 2
Get values at specified column and axis
>>> df.xs('num_wings', axis=1)
class animal locomotion
mammal cat walks 0
dog walks 0
bat flies 2
bird penguin walks 2
Name: num_wings, dtype: int64
"""
axis = self._get_axis_number(axis)
labels = self._get_axis(axis)
if level is not None:
loc, new_ax = labels.get_loc_level(key, level=level, drop_level=drop_level)
# create the tuple of the indexer
_indexer = [slice(None)] * self.ndim
_indexer[axis] = loc
indexer = tuple(_indexer)
result = self.iloc[indexer]
setattr(result, result._get_axis_name(axis), new_ax)
return result
if axis == 1:
return self[key]
self._consolidate_inplace()
index = self.index
if isinstance(index, MultiIndex):
loc, new_index = self.index.get_loc_level(key, drop_level=drop_level)
else:
loc = self.index.get_loc(key)
if isinstance(loc, np.ndarray):
if loc.dtype == np.bool_:
(inds,) = loc.nonzero()
return self._take_with_is_copy(inds, axis=axis)
else:
return self._take_with_is_copy(loc, axis=axis)
if not is_scalar(loc):
new_index = self.index[loc]
if is_scalar(loc):
new_values = self._data.fast_xs(loc)
# may need to box a datelike-scalar
#
# if we encounter an array-like and we only have 1 dim
# that means that their are list/ndarrays inside the Series!
# so just return them (GH 6394)
if not is_list_like(new_va