Permalink
bdb7a16 Sep 21, 2018
311 contributors

Users who have contributed to this file

@wesm @jreback @adamklein @y-p @TomAugspurger @cpcloud @changhiskhan @jorisvandenbossche @jtratner @lodagro @gfyoung @jbrockmendel @takluyver @sinhrks @toobaz @hayd @topper-123 @jseabold @mroeschke @reidy-p @mortada @immerrr @alphaCTzo7G @chris-b1 @bashtage @aflaxman
7904 lines (6606 sloc) 280 KB
"""
DataFrame
---------
An efficient 2D container for potentially mixed-type time series or other
labeled data series.
Similar to its R counterpart, data.frame, except providing automatic data
alignment and a host of useful data manipulation methods having to do with the
labeling information
"""
from __future__ import division
# pylint: disable=E1101,E1103
# pylint: disable=W0212,W0231,W0703,W0622
import functools
import collections
import itertools
import sys
import warnings
from textwrap import dedent
import numpy as np
import numpy.ma as ma
from pandas.core.accessor import CachedAccessor
from pandas.core.dtypes.cast import (
maybe_upcast,
cast_scalar_to_array,
construct_1d_arraylike_from_scalar,
infer_dtype_from_scalar,
maybe_cast_to_datetime,
maybe_infer_to_datetimelike,
maybe_convert_platform,
maybe_downcast_to_dtype,
invalidate_string_dtypes,
coerce_to_dtypes,
maybe_upcast_putmask,
find_common_type)
from pandas.core.dtypes.common import (
is_categorical_dtype,
is_object_dtype,
is_extension_type,
is_extension_array_dtype,
is_datetimetz,
is_datetime64_any_dtype,
is_bool_dtype,
is_integer_dtype,
is_float_dtype,
is_integer,
is_scalar,
is_dtype_equal,
needs_i8_conversion,
_get_dtype_from_object,
ensure_float64,
ensure_int64,
ensure_platform_int,
is_list_like,
is_nested_list_like,
is_iterator,
is_sequence,
is_named_tuple)
from pandas.core.dtypes.concat import _get_sliced_frame_result_type
from pandas.core.dtypes.missing import isna, notna
from pandas.core.generic import NDFrame, _shared_docs
from pandas.core.index import (Index, MultiIndex, ensure_index,
ensure_index_from_sequences)
from pandas.core.indexing import (maybe_droplevels, convert_to_index_sliceable,
check_bool_indexer)
from pandas.core.internals import (BlockManager,
create_block_manager_from_arrays,
create_block_manager_from_blocks)
from pandas.core.series import Series
from pandas.core.arrays import Categorical, ExtensionArray
import pandas.core.algorithms as algorithms
from pandas.compat import (range, map, zip, lrange, lmap, lzip, StringIO, u,
OrderedDict, raise_with_traceback,
string_and_binary_types)
from pandas import compat
from pandas.compat import PY36
from pandas.compat.numpy import function as nv
from pandas.util._decorators import (Appender, Substitution,
rewrite_axis_style_signature,
deprecate_kwarg)
from pandas.util._validators import (validate_bool_kwarg,
validate_axis_style_args)
from pandas.core.indexes.period import PeriodIndex
from pandas.core.indexes.datetimes import DatetimeIndex
from pandas.core.indexes.timedeltas import TimedeltaIndex
import pandas.core.indexes.base as ibase
import pandas.core.common as com
import pandas.core.nanops as nanops
import pandas.core.ops as ops
import pandas.io.formats.console as console
import pandas.io.formats.format as fmt
from pandas.io.formats.printing import pprint_thing
import pandas.plotting._core as gfx
from pandas._libs import lib, algos as libalgos
from pandas.core.config import get_option
# ---------------------------------------------------------------------
# Docstring templates
_shared_doc_kwargs = dict(
axes='index, columns', klass='DataFrame',
axes_single_arg="{0 or 'index', 1 or 'columns'}",
axis="""axis : {0 or 'index', 1 or 'columns'}, default 0
If 0 or 'index': apply function to each column.
If 1 or 'columns': apply function to each row.""",
optional_by="""
by : str or list of str
Name or list of names to sort by.
- if `axis` is 0 or `'index'` then `by` may contain index
levels and/or column labels
- if `axis` is 1 or `'columns'` then `by` may contain column
levels and/or index labels
.. versionchanged:: 0.23.0
Allow specifying index or column level names.""",
versionadded_to_excel='',
optional_labels="""labels : array-like, optional
New labels / index to conform the axis specified by 'axis' to.""",
optional_axis="""axis : int or str, optional
Axis to target. Can be either the axis name ('index', 'columns')
or number (0, 1).""",
)
_numeric_only_doc = """numeric_only : boolean, default None
Include only float, int, boolean data. If None, will attempt to use
everything, then use only numeric data
"""
_merge_doc = """
Merge DataFrame or named Series objects with a database-style join.
The join is done on columns or indexes. If joining columns on
columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes
on indexes or indexes on a column or columns, the index will be passed on.
Parameters
----------%s
right : DataFrame or named Series
Object to merge with.
how : {'left', 'right', 'outer', 'inner'}, default 'inner'
Type of merge to be performed.
* left: use only keys from left frame, similar to a SQL left outer join;
preserve key order.
* right: use only keys from right frame, similar to a SQL right outer join;
preserve key order.
* outer: use union of keys from both frames, similar to a SQL full outer
join; sort keys lexicographically.
* inner: use intersection of keys from both frames, similar to a SQL inner
join; preserve the order of the left keys.
on : label or list
Column or index level names to join on. These must be found in both
DataFrames. If `on` is None and not merging on indexes then this defaults
to the intersection of the columns in both DataFrames.
left_on : label or list, or array-like
Column or index level names to join on in the left DataFrame. Can also
be an array or list of arrays of the length of the left DataFrame.
These arrays are treated as if they are columns.
right_on : label or list, or array-like
Column or index level names to join on in the right DataFrame. Can also
be an array or list of arrays of the length of the right DataFrame.
These arrays are treated as if they are columns.
left_index : bool, default False
Use the index from the left DataFrame as the join key(s). If it is a
MultiIndex, the number of keys in the other DataFrame (either the index
or a number of columns) must match the number of levels.
right_index : bool, default False
Use the index from the right DataFrame as the join key. Same caveats as
left_index.
sort : bool, default False
Sort the join keys lexicographically in the result DataFrame. If False,
the order of the join keys depends on the join type (how keyword).
suffixes : tuple of (str, str), default ('_x', '_y')
Suffix to apply to overlapping column names in the left and right
side, respectively. To raise an exception on overlapping columns use
(False, False).
copy : bool, default True
If False, avoid copy if possible.
indicator : bool or str, default False
If True, adds a column to output DataFrame called "_merge" with
information on the source of each row.
If string, column with information on source of each row will be added to
output DataFrame, and column will be named value of string.
Information column is Categorical-type and takes on a value of "left_only"
for observations whose merge key only appears in 'left' DataFrame,
"right_only" for observations whose merge key only appears in 'right'
DataFrame, and "both" if the observation's merge key is found in both.
validate : str, optional
If specified, checks if merge is of specified type.
* "one_to_one" or "1:1": check if merge keys are unique in both
left and right datasets.
* "one_to_many" or "1:m": check if merge keys are unique in left
dataset.
* "many_to_one" or "m:1": check if merge keys are unique in right
dataset.
* "many_to_many" or "m:m": allowed, but does not result in checks.
.. versionadded:: 0.21.0
Returns
-------
DataFrame
A DataFrame of the two merged objects.
Notes
-----
Support for specifying index levels as the `on`, `left_on`, and
`right_on` parameters was added in version 0.23.0
Support for merging named Series objects was added in version 0.24.0
See Also
--------
merge_ordered : merge with optional filling/interpolation.
merge_asof : merge on nearest keys.
DataFrame.join : similar method using indices.
Examples
--------
>>> df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
... 'value': [1, 2, 3, 5]})
>>> df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
... 'value': [5, 6, 7, 8]})
>>> df1
lkey value
0 foo 1
1 bar 2
2 baz 3
3 foo 5
>>> df2
rkey value
0 foo 5
1 bar 6
2 baz 7
3 foo 8
Merge df1 and df2 on the lkey and rkey columns. The value columns have
the default suffixes, _x and _y, appended.
>>> df1.merge(df2, left_on='lkey', right_on='rkey')
lkey value_x rkey value_y
0 foo 1 foo 5
1 foo 1 foo 8
2 foo 5 foo 5
3 foo 5 foo 8
4 bar 2 bar 6
5 baz 3 baz 7
Merge DataFrames df1 and df2 with specified left and right suffixes
appended to any overlapping columns.
>>> df1.merge(df2, left_on='lkey', right_on='rkey',
... suffixes=('_left', '_right'))
lkey value_left rkey value_right
0 foo 1 foo 5
1 foo 1 foo 8
2 foo 5 foo 5
3 foo 5 foo 8
4 bar 2 bar 6
5 baz 3 baz 7
Merge DataFrames df1 and df2, but raise an exception if the DataFrames have
any overlapping columns.
>>> df1.merge(df2, left_on='lkey', right_on='rkey', suffixes=(False, False))
Traceback (most recent call last):
...
ValueError: columns overlap but no suffix specified:
Index(['value'], dtype='object')
"""
# -----------------------------------------------------------------------
# DataFrame class
class DataFrame(NDFrame):
""" Two-dimensional size-mutable, potentially heterogeneous tabular data
structure with labeled axes (rows and columns). Arithmetic operations
align on both row and column labels. Can be thought of as a dict-like
container for Series objects. The primary pandas data structure.
Parameters
----------
data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame
Dict can contain Series, arrays, constants, or list-like objects
.. versionchanged :: 0.23.0
If data is a dict, argument order is maintained for Python 3.6
and later.
index : Index or array-like
Index to use for resulting frame. Will default to RangeIndex if
no indexing information part of input data and no index provided
columns : Index or array-like
Column labels to use for resulting frame. Will default to
RangeIndex (0, 1, 2, ..., n) if no column labels are provided
dtype : dtype, default None
Data type to force. Only a single dtype is allowed. If None, infer
copy : boolean, default False
Copy data from inputs. Only affects DataFrame / 2d ndarray input
Examples
--------
Constructing DataFrame from a dictionary.
>>> d = {'col1': [1, 2], 'col2': [3, 4]}
>>> df = pd.DataFrame(data=d)
>>> df
col1 col2
0 1 3
1 2 4
Notice that the inferred dtype is int64.
>>> df.dtypes
col1 int64
col2 int64
dtype: object
To enforce a single dtype:
>>> df = pd.DataFrame(data=d, dtype=np.int8)
>>> df.dtypes
col1 int8
col2 int8
dtype: object
Constructing DataFrame from numpy ndarray:
>>> df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
... columns=['a', 'b', 'c'])
>>> df2
a b c
0 1 2 3
1 4 5 6
2 7 8 9
See also
--------
DataFrame.from_records : constructor from tuples, also record arrays
DataFrame.from_dict : from dicts of Series, arrays, or dicts
DataFrame.from_items : from sequence of (key, value) pairs
pandas.read_csv, pandas.read_table, pandas.read_clipboard
"""
@property
def _constructor(self):
return DataFrame
_constructor_sliced = Series
_deprecations = NDFrame._deprecations | frozenset(
['sortlevel', 'get_value', 'set_value', 'from_csv', 'from_items'])
_accessors = set()
@property
def _constructor_expanddim(self):
from pandas.core.panel import Panel
return Panel
def __init__(self, data=None, index=None, columns=None, dtype=None,
copy=False):
if data is None:
data = {}
if dtype is not None:
dtype = self._validate_dtype(dtype)
if isinstance(data, DataFrame):
data = data._data
if isinstance(data, BlockManager):
mgr = self._init_mgr(data, axes=dict(index=index, columns=columns),
dtype=dtype, copy=copy)
elif isinstance(data, dict):
mgr = self._init_dict(data, index, columns, dtype=dtype)
elif isinstance(data, ma.MaskedArray):
import numpy.ma.mrecords as mrecords
# masked recarray
if isinstance(data, mrecords.MaskedRecords):
mgr = _masked_rec_array_to_mgr(data, index, columns, dtype,
copy)
# a masked array
else:
mask = ma.getmaskarray(data)
if mask.any():
data, fill_value = maybe_upcast(data, copy=True)
data[mask] = fill_value
else:
data = data.copy()
mgr = self._init_ndarray(data, index, columns, dtype=dtype,
copy=copy)
elif isinstance(data, (np.ndarray, Series, Index)):
if data.dtype.names:
data_columns = list(data.dtype.names)
data = {k: data[k] for k in data_columns}
if columns is None:
columns = data_columns
mgr = self._init_dict(data, index, columns, dtype=dtype)
elif getattr(data, 'name', None) is not None:
mgr = self._init_dict({data.name: data}, index, columns,
dtype=dtype)
else:
mgr = self._init_ndarray(data, index, columns, dtype=dtype,
copy=copy)
# For data is list-like, or Iterable (will consume into list)
elif (isinstance(data, compat.Iterable)
and not isinstance(data, string_and_binary_types)):
if not isinstance(data, compat.Sequence):
data = list(data)
if len(data) > 0:
if is_list_like(data[0]) and getattr(data[0], 'ndim', 1) == 1:
if is_named_tuple(data[0]) and columns is None:
columns = data[0]._fields
arrays, columns = _to_arrays(data, columns, dtype=dtype)
columns = ensure_index(columns)
# set the index
if index is None:
if isinstance(data[0], Series):
index = _get_names_from_index(data)
elif isinstance(data[0], Categorical):
index = ibase.default_index(len(data[0]))
else:
index = ibase.default_index(len(data))
mgr = _arrays_to_mgr(arrays, columns, index, columns,
dtype=dtype)
else:
mgr = self._init_ndarray(data, index, columns, dtype=dtype,
copy=copy)
else:
mgr = self._init_dict({}, index, columns, dtype=dtype)
else:
try:
arr = np.array(data, dtype=dtype, copy=copy)
except (ValueError, TypeError) as e:
exc = TypeError('DataFrame constructor called with '
'incompatible data and dtype: {e}'.format(e=e))
raise_with_traceback(exc)
if arr.ndim == 0 and index is not None and columns is not None:
values = cast_scalar_to_array((len(index), len(columns)),
data, dtype=dtype)
mgr = self._init_ndarray(values, index, columns,
dtype=values.dtype, copy=False)
else:
raise ValueError('DataFrame constructor not properly called!')
NDFrame.__init__(self, mgr, fastpath=True)
def _init_dict(self, data, index, columns, dtype=None):
"""
Segregate Series based on type and coerce into matrices.
Needs to handle a lot of exceptional cases.
"""
if columns is not None:
arrays = Series(data, index=columns, dtype=object)
data_names = arrays.index
missing = arrays.isnull()
if index is None:
# GH10856
# raise ValueError if only scalars in dict
index = extract_index(arrays[~missing])
else:
index = ensure_index(index)
# no obvious "empty" int column
if missing.any() and not is_integer_dtype(dtype):
if dtype is None or np.issubdtype(dtype, np.flexible):
# 1783
nan_dtype = object
else:
nan_dtype = dtype
v = construct_1d_arraylike_from_scalar(np.nan, len(index),
nan_dtype)
arrays.loc[missing] = [v] * missing.sum()
else:
keys = com.dict_keys_to_ordered_list(data)
columns = data_names = Index(keys)
arrays = [data[k] for k in keys]
return _arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
def _init_ndarray(self, values, index, columns, dtype=None, copy=False):
# input must be a ndarray, list, Series, index
if isinstance(values, Series):
if columns is None:
if values.name is not None:
columns = [values.name]
if index is None:
index = values.index
else:
values = values.reindex(index)
# zero len case (GH #2234)
if not len(values) and columns is not None and len(columns):
values = np.empty((0, 1), dtype=object)
# helper to create the axes as indexes
def _get_axes(N, K, index=index, columns=columns):
# return axes or defaults
if index is None:
index = ibase.default_index(N)
else:
index = ensure_index(index)
if columns is None:
columns = ibase.default_index(K)
else:
columns = ensure_index(columns)
return index, columns
# we could have a categorical type passed or coerced to 'category'
# recast this to an _arrays_to_mgr
if (is_categorical_dtype(getattr(values, 'dtype', None)) or
is_categorical_dtype(dtype)):
if not hasattr(values, 'dtype'):
values = _prep_ndarray(values, copy=copy)
values = values.ravel()
elif copy:
values = values.copy()
index, columns = _get_axes(len(values), 1)
return _arrays_to_mgr([values], columns, index, columns,
dtype=dtype)
elif (is_datetimetz(values) or is_extension_array_dtype(values)):
# GH19157
if columns is None:
columns = [0]
return _arrays_to_mgr([values], columns, index, columns,
dtype=dtype)
# by definition an array here
# the dtypes will be coerced to a single dtype
values = _prep_ndarray(values, copy=copy)
if dtype is not None:
if not is_dtype_equal(values.dtype, dtype):
try:
values = values.astype(dtype)
except Exception as orig:
e = ValueError("failed to cast to '{dtype}' (Exception "
"was: {orig})".format(dtype=dtype,
orig=orig))
raise_with_traceback(e)
index, columns = _get_axes(*values.shape)
values = values.T
# if we don't have a dtype specified, then try to convert objects
# on the entire block; this is to convert if we have datetimelike's
# embedded in an object type
if dtype is None and is_object_dtype(values):
values = maybe_infer_to_datetimelike(values)
return create_block_manager_from_blocks([values], [columns, index])
@property
def axes(self):
"""
Return a list representing the axes of the DataFrame.
It has the row axis labels and column axis labels as the only members.
They are returned in that order.
Examples
--------
>>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
>>> df.axes
[RangeIndex(start=0, stop=2, step=1), Index(['coll', 'col2'],
dtype='object')]
"""
return [self.index, self.columns]
@property
def shape(self):
"""
Return a tuple representing the dimensionality of the DataFrame.
See Also
--------
ndarray.shape
Examples
--------
>>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
>>> df.shape
(2, 2)
>>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4],
... 'col3': [5, 6]})
>>> df.shape
(2, 3)
"""
return len(self.index), len(self.columns)
@property
def _is_homogeneous(self):
"""
Whether all the columns in a DataFrame have the same type.
Returns
-------
bool
Examples
--------
>>> DataFrame({"A": [1, 2], "B": [3, 4]})._is_homogeneous
True
>>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous
False
Items with the same type but different sizes are considered
different types.
>>> DataFrame({"A": np.array([1, 2], dtype=np.int32),
... "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous
False
"""
if self._data.any_extension_types:
return len({block.dtype for block in self._data.blocks}) == 1
else:
return not self._data.is_mixed_type
def _repr_fits_vertical_(self):
"""
Check length against max_rows.
"""
max_rows = get_option("display.max_rows")
return len(self) <= max_rows
def _repr_fits_horizontal_(self, ignore_width=False):
"""
Check if full repr fits in horizontal boundaries imposed by the display
options width and max_columns. In case off non-interactive session, no
boundaries apply.
ignore_width is here so ipnb+HTML output can behave the way
users expect. display.max_columns remains in effect.
GH3541, GH3573
"""
width, height = console.get_console_size()
max_columns = get_option("display.max_columns")
nb_columns = len(self.columns)
# exceed max columns
if ((max_columns and nb_columns > max_columns) or
((not ignore_width) and width and nb_columns > (width // 2))):
return False
# used by repr_html under IPython notebook or scripts ignore terminal
# dims
if ignore_width or not console.in_interactive_session():
return True
if (get_option('display.width') is not None or
console.in_ipython_frontend()):
# check at least the column row for excessive width
max_rows = 1
else:
max_rows = get_option("display.max_rows")
# when auto-detecting, so width=None and not in ipython front end
# check whether repr fits horizontal by actually checking
# the width of the rendered repr
buf = StringIO()
# only care about the stuff we'll actually print out
# and to_string on entire frame may be expensive
d = self
if not (max_rows is None): # unlimited rows
# min of two, where one may be None
d = d.iloc[:min(max_rows, len(d))]
else:
return True
d.to_string(buf=buf)
value = buf.getvalue()
repr_width = max(len(l) for l in value.split('\n'))
return repr_width < width
def _info_repr(self):
"""True if the repr should show the info view."""
info_repr_option = (get_option("display.large_repr") == "info")
return info_repr_option and not (self._repr_fits_horizontal_() and
self._repr_fits_vertical_())
def __unicode__(self):
"""
Return a string representation for a particular DataFrame
Invoked by unicode(df) in py2 only. Yields a Unicode String in both
py2/py3.
"""
buf = StringIO(u(""))
if self._info_repr():
self.info(buf=buf)
return buf.getvalue()
max_rows = get_option("display.max_rows")
max_cols = get_option("display.max_columns")
show_dimensions = get_option("display.show_dimensions")
if get_option("display.expand_frame_repr"):
width, _ = console.get_console_size()
else:
width = None
self.to_string(buf=buf, max_rows=max_rows, max_cols=max_cols,
line_width=width, show_dimensions=show_dimensions)
return buf.getvalue()
def _repr_html_(self):
"""
Return a html representation for a particular DataFrame.
Mainly for IPython notebook.
"""
# qtconsole doesn't report its line width, and also
# behaves badly when outputting an HTML table
# that doesn't fit the window, so disable it.
# XXX: In IPython 3.x and above, the Qt console will not attempt to
# display HTML, so this check can be removed when support for
# IPython 2.x is no longer needed.
if console.in_qtconsole():
# 'HTML output is disabled in QtConsole'
return None
if self._info_repr():
buf = StringIO(u(""))
self.info(buf=buf)
# need to escape the <class>, should be the first line.
val = buf.getvalue().replace('<', r'&lt;', 1)
val = val.replace('>', r'&gt;', 1)
return '<pre>' + val + '</pre>'
if get_option("display.notebook_repr_html"):
max_rows = get_option("display.max_rows")
max_cols = get_option("display.max_columns")
show_dimensions = get_option("display.show_dimensions")
return self.to_html(max_rows=max_rows, max_cols=max_cols,
show_dimensions=show_dimensions, notebook=True)
else:
return None
@property
def style(self):
"""
Property returning a Styler object containing methods for
building a styled HTML representation fo the DataFrame.
See Also
--------
pandas.io.formats.style.Styler
"""
from pandas.io.formats.style import Styler
return Styler(self)
def iteritems(self):
"""
Iterator over (column name, Series) pairs.
See also
--------
iterrows : Iterate over DataFrame rows as (index, Series) pairs.
itertuples : Iterate over DataFrame rows as namedtuples of the values.
"""
if self.columns.is_unique and hasattr(self, '_item_cache'):
for k in self.columns:
yield k, self._get_item_cache(k)
else:
for i, k in enumerate(self.columns):
yield k, self._ixs(i, axis=1)
def iterrows(self):
"""
Iterate over DataFrame rows as (index, Series) pairs.
Notes
-----
1. Because ``iterrows`` returns a Series for each row,
it does **not** preserve dtypes across the rows (dtypes are
preserved across columns for DataFrames). For example,
>>> df = pd.DataFrame([[1, 1.5]], columns=['int', 'float'])
>>> row = next(df.iterrows())[1]
>>> row
int 1.0
float 1.5
Name: 0, dtype: float64
>>> print(row['int'].dtype)
float64
>>> print(df['int'].dtype)
int64
To preserve dtypes while iterating over the rows, it is better
to use :meth:`itertuples` which returns namedtuples of the values
and which is generally faster than ``iterrows``.
2. You should **never modify** something you are iterating over.
This is not guaranteed to work in all cases. Depending on the
data types, the iterator returns a copy and not a view, and writing
to it will have no effect.
Returns
-------
it : generator
A generator that iterates over the rows of the frame.
See also
--------
itertuples : Iterate over DataFrame rows as namedtuples of the values.
iteritems : Iterate over (column name, Series) pairs.
"""
columns = self.columns
klass = self._constructor_sliced
for k, v in zip(self.index, self.values):
s = klass(v, index=columns, name=k)
yield k, s
def itertuples(self, index=True, name="Pandas"):
"""
Iterate over DataFrame rows as namedtuples, with index value as first
element of the tuple.
Parameters
----------
index : boolean, default True
If True, return the index as the first element of the tuple.
name : string, default "Pandas"
The name of the returned namedtuples or None to return regular
tuples.
Notes
-----
The column names will be renamed to positional names if they are
invalid Python identifiers, repeated, or start with an underscore.
With a large number of columns (>255), regular tuples are returned.
See also
--------
iterrows : Iterate over DataFrame rows as (index, Series) pairs.
iteritems : Iterate over (column name, Series) pairs.
Examples
--------
>>> df = pd.DataFrame({'col1': [1, 2], 'col2': [0.1, 0.2]},
index=['a', 'b'])
>>> df
col1 col2
a 1 0.1
b 2 0.2
>>> for row in df.itertuples():
... print(row)
...
Pandas(Index='a', col1=1, col2=0.10000000000000001)
Pandas(Index='b', col1=2, col2=0.20000000000000001)
"""
arrays = []
fields = []
if index:
arrays.append(self.index)
fields.append("Index")
# use integer indexing because of possible duplicate column names
arrays.extend(self.iloc[:, k] for k in range(len(self.columns)))
# Python 3 supports at most 255 arguments to constructor, and
# things get slow with this many fields in Python 2
if name is not None and len(self.columns) + index < 256:
# `rename` is unsupported in Python 2.6
try:
itertuple = collections.namedtuple(name,
fields + list(self.columns),
rename=True)
return map(itertuple._make, zip(*arrays))
except Exception:
pass
# fallback to regular tuples
return zip(*arrays)
items = iteritems
def __len__(self):
"""Returns length of info axis, but here we use the index """
return len(self.index)
def dot(self, other):
"""
Matrix multiplication with DataFrame or Series objects. Can also be
called using `self @ other` in Python >= 3.5.
Parameters
----------
other : DataFrame or Series
Returns
-------
dot_product : DataFrame or Series
"""
if isinstance(other, (Series, DataFrame)):
common = self.columns.union(other.index)
if (len(common) > len(self.columns) or
len(common) > len(other.index)):
raise ValueError('matrices are not aligned')
left = self.reindex(columns=common, copy=False)
right = other.reindex(index=common, copy=False)
lvals = left.values
rvals = right.values
else:
left = self
lvals = self.values
rvals = np.asarray(other)
if lvals.shape[1] != rvals.shape[0]:
raise ValueError('Dot product shape mismatch, '
'{l} vs {r}'.format(l=lvals.shape,
r=rvals.shape))
if isinstance(other, DataFrame):
return self._constructor(np.dot(lvals, rvals), index=left.index,
columns=other.columns)
elif isinstance(other, Series):
return Series(np.dot(lvals, rvals), index=left.index)
elif isinstance(rvals, (np.ndarray, Index)):
result = np.dot(lvals, rvals)
if result.ndim == 2:
return self._constructor(result, index=left.index)
else:
return Series(result, index=left.index)
else: # pragma: no cover
raise TypeError('unsupported type: {oth}'.format(oth=type(other)))
def __matmul__(self, other):
""" Matrix multiplication using binary `@` operator in Python>=3.5 """
return self.dot(other)
def __rmatmul__(self, other):
""" Matrix multiplication using binary `@` operator in Python>=3.5 """
return self.T.dot(np.transpose(other)).T
# ----------------------------------------------------------------------
# IO methods (to / from other formats)
@classmethod
def from_dict(cls, data, orient='columns', dtype=None, columns=None):
"""
Construct DataFrame from dict of array-like or dicts.
Creates DataFrame object from dictionary by columns or by index
allowing dtype specification.
Parameters
----------
data : dict
Of the form {field : array-like} or {field : dict}.
orient : {'columns', 'index'}, default 'columns'
The "orientation" of the data. If the keys of the passed dict
should be the columns of the resulting DataFrame, pass 'columns'
(default). Otherwise if the keys should be rows, pass 'index'.
dtype : dtype, default None
Data type to force, otherwise infer.
columns : list, default None
Column labels to use when ``orient='index'``. Raises a ValueError
if used with ``orient='columns'``.
.. versionadded:: 0.23.0
Returns
-------
pandas.DataFrame
See Also
--------
DataFrame.from_records : DataFrame from ndarray (structured
dtype), list of tuples, dict, or DataFrame
DataFrame : DataFrame object creation using constructor
Examples
--------
By default the keys of the dict become the DataFrame columns:
>>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']}
>>> pd.DataFrame.from_dict(data)
col_1 col_2
0 3 a
1 2 b
2 1 c
3 0 d
Specify ``orient='index'`` to create the DataFrame using dictionary
keys as rows:
>>> data = {'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd']}
>>> pd.DataFrame.from_dict(data, orient='index')
0 1 2 3
row_1 3 2 1 0
row_2 a b c d
When using the 'index' orientation, the column names can be
specified manually:
>>> pd.DataFrame.from_dict(data, orient='index',
... columns=['A', 'B', 'C', 'D'])
A B C D
row_1 3 2 1 0
row_2 a b c d
"""
index = None
orient = orient.lower()
if orient == 'index':
if len(data) > 0:
# TODO speed up Series case
if isinstance(list(data.values())[0], (Series, dict)):
data = _from_nested_dict(data)
else:
data, index = list(data.values()), list(data.keys())
elif orient == 'columns':
if columns is not None:
raise ValueError("cannot use columns parameter with "
"orient='columns'")
else: # pragma: no cover
raise ValueError('only recognize index or columns for orient')
return cls(data, index=index, columns=columns, dtype=dtype)
def to_dict(self, orient='dict', into=dict):
"""
Convert the DataFrame to a dictionary.
The type of the key-value pairs can be customized with the parameters
(see below).
Parameters
----------
orient : str {'dict', 'list', 'series', 'split', 'records', 'index'}
Determines the type of the values of the dictionary.
- 'dict' (default) : dict like {column -> {index -> value}}
- 'list' : dict like {column -> [values]}
- 'series' : dict like {column -> Series(values)}
- 'split' : dict like
{'index' -> [index], 'columns' -> [columns], 'data' -> [values]}
- 'records' : list like
[{column -> value}, ... , {column -> value}]
- 'index' : dict like {index -> {column -> value}}
Abbreviations are allowed. `s` indicates `series` and `sp`
indicates `split`.
into : class, default dict
The collections.Mapping subclass used for all Mappings
in the return value. Can be the actual class or an empty
instance of the mapping type you want. If you want a
collections.defaultdict, you must pass it initialized.
.. versionadded:: 0.21.0
Returns
-------
result : collections.Mapping like {column -> {index -> value}}
See Also
--------
DataFrame.from_dict: create a DataFrame from a dictionary
DataFrame.to_json: convert a DataFrame to JSON format
Examples
--------
>>> df = pd.DataFrame({'col1': [1, 2],
... 'col2': [0.5, 0.75]},
... index=['a', 'b'])
>>> df
col1 col2
a 1 0.50
b 2 0.75
>>> df.to_dict()
{'col1': {'a': 1, 'b': 2}, 'col2': {'a': 0.5, 'b': 0.75}}
You can specify the return orientation.
>>> df.to_dict('series')
{'col1': a 1
b 2
Name: col1, dtype: int64,
'col2': a 0.50
b 0.75
Name: col2, dtype: float64}
>>> df.to_dict('split')
{'index': ['a', 'b'], 'columns': ['col1', 'col2'],
'data': [[1.0, 0.5], [2.0, 0.75]]}
>>> df.to_dict('records')
[{'col1': 1.0, 'col2': 0.5}, {'col1': 2.0, 'col2': 0.75}]
>>> df.to_dict('index')
{'a': {'col1': 1.0, 'col2': 0.5}, 'b': {'col1': 2.0, 'col2': 0.75}}
You can also specify the mapping type.
>>> from collections import OrderedDict, defaultdict
>>> df.to_dict(into=OrderedDict)
OrderedDict([('col1', OrderedDict([('a', 1), ('b', 2)])),
('col2', OrderedDict([('a', 0.5), ('b', 0.75)]))])
If you want a `defaultdict`, you need to initialize it:
>>> dd = defaultdict(list)
>>> df.to_dict('records', into=dd)
[defaultdict(<class 'list'>, {'col1': 1.0, 'col2': 0.5}),
defaultdict(<class 'list'>, {'col1': 2.0, 'col2': 0.75})]
"""
if not self.columns.is_unique:
warnings.warn("DataFrame columns are not unique, some "
"columns will be omitted.", UserWarning,
stacklevel=2)
# GH16122
into_c = com.standardize_mapping(into)
if orient.lower().startswith('d'):
return into_c(
(k, v.to_dict(into)) for k, v in compat.iteritems(self))
elif orient.lower().startswith('l'):
return into_c((k, v.tolist()) for k, v in compat.iteritems(self))
elif orient.lower().startswith('sp'):
return into_c((('index', self.index.tolist()),
('columns', self.columns.tolist()),
('data', lib.map_infer(self.values.ravel(),
com.maybe_box_datetimelike)
.reshape(self.values.shape).tolist())))
elif orient.lower().startswith('s'):
return into_c((k, com.maybe_box_datetimelike(v))
for k, v in compat.iteritems(self))
elif orient.lower().startswith('r'):
return [into_c((k, com.maybe_box_datetimelike(v))
for k, v in zip(self.columns, np.atleast_1d(row)))
for row in self.values]
elif orient.lower().startswith('i'):
return into_c((t[0], dict(zip(self.columns, t[1:])))
for t in self.itertuples())
else:
raise ValueError("orient '{o}' not understood".format(o=orient))
def to_gbq(self, destination_table, project_id=None, chunksize=None,
reauth=False, if_exists='fail', private_key=None,
auth_local_webserver=False, table_schema=None, location=None,
progress_bar=True, verbose=None):
"""
Write a DataFrame to a Google BigQuery table.
This function requires the `pandas-gbq package
<https://pandas-gbq.readthedocs.io>`__.
See the `How to authenticate with Google BigQuery
<https://pandas-gbq.readthedocs.io/en/latest/howto/authentication.html>`__
guide for authentication instructions.
Parameters
----------
destination_table : str
Name of table to be written, in the form ``dataset.tablename``.
project_id : str, optional
Google BigQuery Account project ID. Optional when available from
the environment.
chunksize : int, optional
Number of rows to be inserted in each chunk from the dataframe.
Set to ``None`` to load the whole dataframe at once.
reauth : bool, default False
Force Google BigQuery to re-authenticate the user. This is useful
if multiple accounts are used.
if_exists : str, default 'fail'
Behavior when the destination table exists. Value can be one of:
``'fail'``
If table exists, do nothing.
``'replace'``
If table exists, drop it, recreate it, and insert data.
``'append'``
If table exists, insert data. Create if does not exist.
private_key : str, optional
Service account private key in JSON format. Can be file path
or string contents. This is useful for remote server
authentication (eg. Jupyter/IPython notebook on remote host).
auth_local_webserver : bool, default False
Use the `local webserver flow`_ instead of the `console flow`_
when getting user credentials.
.. _local webserver flow:
http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server
.. _console flow:
http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console
*New in version 0.2.0 of pandas-gbq*.
table_schema : list of dicts, optional
List of BigQuery table fields to which according DataFrame
columns conform to, e.g. ``[{'name': 'col1', 'type':
'STRING'},...]``. If schema is not provided, it will be
generated according to dtypes of DataFrame columns. See
BigQuery API documentation on available names of a field.
*New in version 0.3.1 of pandas-gbq*.
location : str, optional
Location where the load job should run. See the `BigQuery locations
documentation
<https://cloud.google.com/bigquery/docs/dataset-locations>`__ for a
list of available locations. The location must match that of the
target dataset.
*New in version 0.5.0 of pandas-gbq*.
progress_bar : bool, default True
Use the library `tqdm` to show the progress bar for the upload,
chunk by chunk.
*New in version 0.5.0 of pandas-gbq*.
verbose : bool, deprecated
Deprecated in Pandas-GBQ 0.4.0. Use the `logging module
to adjust verbosity instead
<https://pandas-gbq.readthedocs.io/en/latest/intro.html#logging>`__.
See Also
--------
pandas_gbq.to_gbq : This function in the pandas-gbq library.
pandas.read_gbq : Read a DataFrame from Google BigQuery.
"""
from pandas.io import gbq
return gbq.to_gbq(
self, destination_table, project_id=project_id,
chunksize=chunksize, reauth=reauth,
if_exists=if_exists, private_key=private_key,
auth_local_webserver=auth_local_webserver,
table_schema=table_schema, location=location,
progress_bar=progress_bar, verbose=verbose)
@classmethod
def from_records(cls, data, index=None, exclude=None, columns=None,
coerce_float=False, nrows=None):
"""
Convert structured or record ndarray to DataFrame
Parameters
----------
data : ndarray (structured dtype), list of tuples, dict, or DataFrame
index : string, list of fields, array-like
Field of array to use as the index, alternately a specific set of
input labels to use
exclude : sequence, default None
Columns or fields to exclude
columns : sequence, default None
Column names to use. If the passed data do not have names
associated with them, this argument provides names for the
columns. Otherwise this argument indicates the order of the columns
in the result (any names not found in the data will become all-NA
columns)
coerce_float : boolean, default False
Attempt to convert values of non-string, non-numeric objects (like
decimal.Decimal) to floating point, useful for SQL result sets
nrows : int, default None
Number of rows to read if data is an iterator
Returns
-------
df : DataFrame
"""
# Make a copy of the input columns so we can modify it
if columns is not None:
columns = ensure_index(columns)
if is_iterator(data):
if nrows == 0:
return cls()
try:
first_row = next(data)
except StopIteration:
return cls(index=index, columns=columns)
dtype = None
if hasattr(first_row, 'dtype') and first_row.dtype.names:
dtype = first_row.dtype
values = [first_row]
if nrows is None:
values += data
else:
values.extend(itertools.islice(data, nrows - 1))
if dtype is not None:
data = np.array(values, dtype=dtype)
else:
data = values
if isinstance(data, dict):
if columns is None:
columns = arr_columns = ensure_index(sorted(data))
arrays = [data[k] for k in columns]
else:
arrays = []
arr_columns = []
for k, v in compat.iteritems(data):
if k in columns:
arr_columns.append(k)
arrays.append(v)
arrays, arr_columns = _reorder_arrays(arrays, arr_columns,
columns)
elif isinstance(data, (np.ndarray, DataFrame)):
arrays, columns = _to_arrays(data, columns)
if columns is not None:
columns = ensure_index(columns)
arr_columns = columns
else:
arrays, arr_columns = _to_arrays(data, columns,
coerce_float=coerce_float)
arr_columns = ensure_index(arr_columns)
if columns is not None:
columns = ensure_index(columns)
else:
columns = arr_columns
if exclude is None:
exclude = set()
else:
exclude = set(exclude)
result_index = None
if index is not None:
if (isinstance(index, compat.string_types) or
not hasattr(index, "__iter__")):
i = columns.get_loc(index)
exclude.add(index)
if len(arrays) > 0:
result_index = Index(arrays[i], name=index)
else:
result_index = Index([], name=index)
else:
try:
to_remove = [arr_columns.get_loc(field) for field in index]
index_data = [arrays[i] for i in to_remove]
result_index = ensure_index_from_sequences(index_data,
names=index)
exclude.update(index)
except Exception:
result_index = index
if any(exclude):
arr_exclude = [x for x in exclude if x in arr_columns]
to_remove = [arr_columns.get_loc(col) for col in arr_exclude]
arrays = [v for i, v in enumerate(arrays) if i not in to_remove]
arr_columns = arr_columns.drop(arr_exclude)
columns = columns.drop(exclude)
mgr = _arrays_to_mgr(arrays, arr_columns, result_index, columns)
return cls(mgr)
def to_records(self, index=True, convert_datetime64=None):
"""
Convert DataFrame to a NumPy record array.
Index will be included as the first field of the record array if
requested.
Parameters
----------
index : bool, default True
Include index in resulting record array, stored in 'index'
field or using the index label, if set.
convert_datetime64 : bool, default None
.. deprecated:: 0.23.0
Whether to convert the index to datetime.datetime if it is a
DatetimeIndex.
Returns
-------
numpy.recarray
NumPy ndarray with the DataFrame labels as fields and each row
of the DataFrame as entries.
See Also
--------
DataFrame.from_records: convert structured or record ndarray
to DataFrame.
numpy.recarray: ndarray that allows field access using
attributes, analogous to typed columns in a
spreadsheet.
Examples
--------
>>> df = pd.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]},
... index=['a', 'b'])
>>> df
A B
a 1 0.50
b 2 0.75
>>> df.to_records()
rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
dtype=[('index', 'O'), ('A', '<i8'), ('B', '<f8')])
If the DataFrame index has no label then the recarray field name
is set to 'index'. If the index has a label then this is used as the
field name:
>>> df.index = df.index.rename("I")
>>> df.to_records()
rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
dtype=[('I', 'O'), ('A', '<i8'), ('B', '<f8')])
The index can be excluded from the record array:
>>> df.to_records(index=False)
rec.array([(1, 0.5 ), (2, 0.75)],
dtype=[('A', '<i8'), ('B', '<f8')])
"""
if convert_datetime64 is not None:
warnings.warn("The 'convert_datetime64' parameter is "
"deprecated and will be removed in a future "
"version",
FutureWarning, stacklevel=2)
if index:
if is_datetime64_any_dtype(self.index) and convert_datetime64:
ix_vals = [self.index.to_pydatetime()]
else:
if isinstance(self.index, MultiIndex):
# array of tuples to numpy cols. copy copy copy
ix_vals = lmap(np.array, zip(*self.index.values))
else:
ix_vals = [self.index.values]
arrays = ix_vals + [self[c].get_values() for c in self.columns]
count = 0
index_names = list(self.index.names)
if isinstance(self.index, MultiIndex):
for i, n in enumerate(index_names):
if n is None:
index_names[i] = 'level_%d' % count
count += 1
elif index_names[0] is None:
index_names = ['index']
names = (lmap(compat.text_type, index_names) +
lmap(compat.text_type, self.columns))
else:
arrays = [self[c].get_values() for c in self.columns]
names = lmap(compat.text_type, self.columns)
formats = [v.dtype for v in arrays]
return np.rec.fromarrays(
arrays,
dtype={'names': names, 'formats': formats}
)
@classmethod
def from_items(cls, items, columns=None, orient='columns'):
"""Construct a dataframe from a list of tuples
.. deprecated:: 0.23.0
`from_items` is deprecated and will be removed in a future version.
Use :meth:`DataFrame.from_dict(dict(items)) <DataFrame.from_dict>`
instead.
:meth:`DataFrame.from_dict(OrderedDict(items)) <DataFrame.from_dict>`
may be used to preserve the key order.
Convert (key, value) pairs to DataFrame. The keys will be the axis
index (usually the columns, but depends on the specified
orientation). The values should be arrays or Series.
Parameters
----------
items : sequence of (key, value) pairs
Values should be arrays or Series.
columns : sequence of column labels, optional
Must be passed if orient='index'.
orient : {'columns', 'index'}, default 'columns'
The "orientation" of the data. If the keys of the
input correspond to column labels, pass 'columns'
(default). Otherwise if the keys correspond to the index,
pass 'index'.
Returns
-------
frame : DataFrame
"""
warnings.warn("from_items is deprecated. Please use "
"DataFrame.from_dict(dict(items), ...) instead. "
"DataFrame.from_dict(OrderedDict(items)) may be used to "
"preserve the key order.",
FutureWarning, stacklevel=2)
keys, values = lzip(*items)
if orient == 'columns':
if columns is not None:
columns = ensure_index(columns)
idict = dict(items)
if len(idict) < len(items):
if not columns.equals(ensure_index(keys)):
raise ValueError('With non-unique item names, passed '
'columns must be identical')
arrays = values
else:
arrays = [idict[k] for k in columns if k in idict]
else:
columns = ensure_index(keys)
arrays = values
# GH 17312
# Provide more informative error msg when scalar values passed
try:
return cls._from_arrays(arrays, columns, None)
except ValueError:
if not is_nested_list_like(values):
raise ValueError('The value in each (key, value) pair '
'must be an array, Series, or dict')
elif orient == 'index':
if columns is None:
raise TypeError("Must pass columns with orient='index'")
keys = ensure_index(keys)
# GH 17312
# Provide more informative error msg when scalar values passed
try:
arr = np.array(values, dtype=object).T
data = [lib.maybe_convert_objects(v) for v in arr]
return cls._from_arrays(data, columns, keys)
except TypeError:
if not is_nested_list_like(values):
raise ValueError('The value in each (key, value) pair '
'must be an array, Series, or dict')
else: # pragma: no cover
raise ValueError("'orient' must be either 'columns' or 'index'")
@classmethod
def _from_arrays(cls, arrays, columns, index, dtype=None):
mgr = _arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
return cls(mgr)
@classmethod
def from_csv(cls, path, header=0, sep=',', index_col=0, parse_dates=True,
encoding=None, tupleize_cols=None,
infer_datetime_format=False):
"""Read CSV file.
.. deprecated:: 0.21.0
Use :func:`pandas.read_csv` instead.
It is preferable to use the more powerful :func:`pandas.read_csv`
for most general purposes, but ``from_csv`` makes for an easy
roundtrip to and from a file (the exact counterpart of
``to_csv``), especially with a DataFrame of time series data.
This method only differs from the preferred :func:`pandas.read_csv`
in some defaults:
- `index_col` is ``0`` instead of ``None`` (take first column as index
by default)
- `parse_dates` is ``True`` instead of ``False`` (try parsing the index
as datetime by default)
So a ``pd.DataFrame.from_csv(path)`` can be replaced by
``pd.read_csv(path, index_col=0, parse_dates=True)``.
Parameters
----------
path : string file path or file handle / StringIO
header : int, default 0
Row to use as header (skip prior rows)
sep : string, default ','
Field delimiter
index_col : int or sequence, default 0
Column to use for index. If a sequence is given, a MultiIndex
is used. Different default from read_table
parse_dates : boolean, default True
Parse dates. Different default from read_table
tupleize_cols : boolean, default False
write multi_index columns as a list of tuples (if True)
or new (expanded format) if False)
infer_datetime_format: boolean, default False
If True and `parse_dates` is True for a column, try to infer the
datetime format based on the first datetime string. If the format
can be inferred, there often will be a large parsing speed-up.
See also
--------
pandas.read_csv
Returns
-------
y : DataFrame
"""
warnings.warn("from_csv is deprecated. Please use read_csv(...) "
"instead. Note that some of the default arguments are "
"different, so please refer to the documentation "
"for from_csv when changing your function calls",
FutureWarning, stacklevel=2)
from pandas.io.parsers import read_csv
return read_csv(path, header=header, sep=sep,
parse_dates=parse_dates, index_col=index_col,
encoding=encoding, tupleize_cols=tupleize_cols,
infer_datetime_format=infer_datetime_format)
def to_sparse(self, fill_value=None, kind='block'):
"""
Convert to SparseDataFrame.
Implement the sparse version of the DataFrame meaning that any data
matching a specific value it's omitted in the representation.
The sparse DataFrame allows for a more efficient storage.
Parameters
----------
fill_value : float, default None
The specific value that should be omitted in the representation.
kind : {'block', 'integer'}, default 'block'
The kind of the SparseIndex tracking where data is not equal to
the fill value:
- 'block' tracks only the locations and sizes of blocks of data.
- 'integer' keeps an array with all the locations of the data.
In most cases 'block' is recommended, since it's more memory
efficient.
Returns
-------
SparseDataFrame
The sparse representation of the DataFrame.
See Also
--------
DataFrame.to_dense :
Converts the DataFrame back to the its dense form.
Examples
--------
>>> df = pd.DataFrame([(np.nan, np.nan),
... (1., np.nan),
... (np.nan, 1.)])
>>> df
0 1
0 NaN NaN
1 1.0 NaN
2 NaN 1.0
>>> type(df)
<class 'pandas.core.frame.DataFrame'>
>>> sdf = df.to_sparse()
>>> sdf
0 1
0 NaN NaN
1 1.0 NaN
2 NaN 1.0
>>> type(sdf)
<class 'pandas.core.sparse.frame.SparseDataFrame'>
"""
from pandas.core.sparse.frame import SparseDataFrame
return SparseDataFrame(self._series, index=self.index,
columns=self.columns, default_kind=kind,
default_fill_value=fill_value)
def to_panel(self):
"""
Transform long (stacked) format (DataFrame) into wide (3D, Panel)
format.
.. deprecated:: 0.20.0
Currently the index of the DataFrame must be a 2-level MultiIndex. This
may be generalized later
Returns
-------
panel : Panel
"""
# only support this kind for now
if (not isinstance(self.index, MultiIndex) or # pragma: no cover
len(self.index.levels) != 2):
raise NotImplementedError('Only 2-level MultiIndex are supported.')
if not self.index.is_unique:
raise ValueError("Can't convert non-uniquely indexed "
"DataFrame to Panel")
self._consolidate_inplace()
# minor axis must be sorted
if self.index.lexsort_depth < 2:
selfsorted = self.sort_index(level=0)
else:
selfsorted = self
major_axis, minor_axis = selfsorted.index.levels
major_labels, minor_labels = selfsorted.index.labels
shape = len(major_axis), len(minor_axis)
# preserve names, if any
major_axis = major_axis.copy()
major_axis.name = self.index.names[0]
minor_axis = minor_axis.copy()
minor_axis.name = self.index.names[1]
# create new axes
new_axes = [selfsorted.columns, major_axis, minor_axis]
# create new manager
new_mgr = selfsorted._data.reshape_nd(axes=new_axes,
labels=[major_labels,
minor_labels],
shape=shape,
ref_items=selfsorted.columns)
return self._constructor_expanddim(new_mgr)
@Appender(_shared_docs['to_excel'] % _shared_doc_kwargs)
def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='',
float_format=None, columns=None, header=True, index=True,
index_label=None, startrow=0, startcol=0, engine=None,
merge_cells=True, encoding=None, inf_rep='inf', verbose=True,
freeze_panes=None):
from pandas.io.formats.excel import ExcelFormatter
formatter = ExcelFormatter(self, na_rep=na_rep, cols=columns,
header=header,
float_format=float_format, index=index,
index_label=index_label,
merge_cells=merge_cells,
inf_rep=inf_rep)
formatter.write(excel_writer, sheet_name=sheet_name, startrow=startrow,
startcol=startcol, freeze_panes=freeze_panes,
engine=engine)
@deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
def to_stata(self, fname, convert_dates=None, write_index=True,
encoding="latin-1", byteorder=None, time_stamp=None,
data_label=None, variable_labels=None, version=114,
convert_strl=None):
"""
Export Stata binary dta files.
Parameters
----------
fname : path (string), buffer or path object
string, path object (pathlib.Path or py._path.local.LocalPath) or
object implementing a binary write() functions. If using a buffer
then the buffer will not be automatically closed after the file
data has been written.
convert_dates : dict
Dictionary mapping columns containing datetime types to stata
internal format to use when writing the dates. Options are 'tc',
'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either an integer
or a name. Datetime columns that do not have a conversion type
specified will be converted to 'tc'. Raises NotImplementedError if
a datetime column has timezone information.
write_index : bool
Write the index to Stata dataset.
encoding : str
Default is latin-1. Unicode is not supported.
byteorder : str
Can be ">", "<", "little", or "big". default is `sys.byteorder`.
time_stamp : datetime
A datetime to use as file creation date. Default is the current
time.
data_label : str
A label for the data set. Must be 80 characters or smaller.
variable_labels : dict
Dictionary containing columns as keys and variable labels as
values. Each label must be 80 characters or smaller.
.. versionadded:: 0.19.0
version : {114, 117}
Version to use in the output dta file. Version 114 can be used
read by Stata 10 and later. Version 117 can be read by Stata 13
or later. Version 114 limits string variables to 244 characters or
fewer while 117 allows strings with lengths up to 2,000,000
characters.
.. versionadded:: 0.23.0
convert_strl : list, optional
List of column names to convert to string columns to Stata StrL
format. Only available if version is 117. Storing strings in the
StrL format can produce smaller dta files if strings have more than
8 characters and values are repeated.
.. versionadded:: 0.23.0
Raises
------
NotImplementedError
* If datetimes contain timezone information
* Column dtype is not representable in Stata
ValueError
* Columns listed in convert_dates are neither datetime64[ns]
or datetime.datetime
* Column listed in convert_dates is not in DataFrame
* Categorical label contains more than 32,000 characters
.. versionadded:: 0.19.0
See Also
--------
pandas.read_stata : Import Stata data files
pandas.io.stata.StataWriter : low-level writer for Stata data files
pandas.io.stata.StataWriter117 : low-level writer for version 117 files
Examples
--------
>>> data.to_stata('./data_file.dta')
Or with dates
>>> data.to_stata('./date_data_file.dta', {2 : 'tw'})
Alternatively you can create an instance of the StataWriter class
>>> writer = StataWriter('./data_file.dta', data)
>>> writer.write_file()
With dates:
>>> writer = StataWriter('./date_data_file.dta', data, {2 : 'tw'})
>>> writer.write_file()
"""
kwargs = {}
if version not in (114, 117):
raise ValueError('Only formats 114 and 117 supported.')
if version == 114:
if convert_strl is not None:
raise ValueError('strl support is only available when using '
'format 117')
from pandas.io.stata import StataWriter as statawriter
else:
from pandas.io.stata import StataWriter117 as statawriter
kwargs['convert_strl'] = convert_strl
writer = statawriter(fname, self, convert_dates=convert_dates,
byteorder=byteorder, time_stamp=time_stamp,
data_label=data_label, write_index=write_index,
variable_labels=variable_labels, **kwargs)
writer.write_file()
def to_feather(self, fname):
"""
write out the binary feather-format for DataFrames
.. versionadded:: 0.20.0
Parameters
----------
fname : str
string file path
"""
from pandas.io.feather_format import to_feather
to_feather(self, fname)
def to_parquet(self, fname, engine='auto', compression='snappy',
index=None, **kwargs):
"""
Write a DataFrame to the binary parquet format.
.. versionadded:: 0.21.0
This function writes the dataframe as a `parquet file
<https://parquet.apache.org/>`_. You can choose different parquet
backends, and have the option of compression. See
:ref:`the user guide <io.parquet>` for more details.
Parameters
----------
fname : str
String file path.
engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
Parquet library to use. If 'auto', then the option
``io.parquet.engine`` is used. The default ``io.parquet.engine``
behavior is to try 'pyarrow', falling back to 'fastparquet' if
'pyarrow' is unavailable.
compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy'
Name of the compression to use. Use ``None`` for no compression.
index : bool, default None
If ``True``, include the dataframe's index(es) in the file output.
If ``False``, they will not be written to the file. If ``None``,
the behavior depends on the chosen engine.
.. versionadded:: 0.24.0
**kwargs
Additional arguments passed to the parquet library. See
:ref:`pandas io <io.parquet>` for more details.
See Also
--------
read_parquet : Read a parquet file.
DataFrame.to_csv : Write a csv file.
DataFrame.to_sql : Write to a sql table.
DataFrame.to_hdf : Write to hdf.
Notes
-----
This function requires either the `fastparquet
<https://pypi.org/project/fastparquet>`_ or `pyarrow
<https://arrow.apache.org/docs/python/>`_ library.
Examples
--------
>>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [3, 4]})
>>> df.to_parquet('df.parquet.gzip', compression='gzip')
>>> pd.read_parquet('df.parquet.gzip')
col1 col2
0 1 3
1 2 4
"""
from pandas.io.parquet import to_parquet
to_parquet(self, fname, engine,
compression=compression, index=index, **kwargs)
@Substitution(header='Write out the column names. If a list of strings '
'is given, it is assumed to be aliases for the '
'column names')
@Substitution(shared_params=fmt.common_docstring,
returns=fmt.return_docstring)
def to_string(self, buf=None, columns=None, col_space=None, header=True,
index=True, na_rep='NaN', formatters=None, float_format=None,
sparsify=None, index_names=True, justify=None,
line_width=None, max_rows=None, max_cols=None,
show_dimensions=False):
"""
Render a DataFrame to a console-friendly tabular output.
%(shared_params)s
line_width : int, optional
Width to wrap a line in characters.
%(returns)s
See Also
--------
to_html : Convert DataFrame to HTML.
Examples
--------
>>> d = {'col1' : [1, 2, 3], 'col2' : [4, 5, 6]}
>>> df = pd.DataFrame(d)
>>> print(df.to_string())
col1 col2
0 1 4
1 2 5
2 3 6
"""
formatter = fmt.DataFrameFormatter(self, buf=buf, columns=columns,
col_space=col_space, na_rep=na_rep,
formatters=formatters,
float_format=float_format,
sparsify=sparsify, justify=justify,
index_names=index_names,
header=header, index=index,
line_width=line_width,
max_rows=max_rows,
max_cols=max_cols,
show_dimensions=show_dimensions)
formatter.to_string()
if buf is None:
result = formatter.buf.getvalue()
return result
@Substitution(header='whether to print column labels, default True')
@Substitution(shared_params=fmt.common_docstring,
returns=fmt.return_docstring)
def to_html(self, buf=None, columns=None, col_space=None, header=True,
index=True, na_rep='NaN', formatters=None, float_format=None,
sparsify=None, index_names=True, justify=None, bold_rows=True,
classes=None, escape=True, max_rows=None, max_cols=None,
show_dimensions=False, notebook=False, decimal='.',
border=None, table_id=None):
"""
Render a DataFrame as an HTML table.
%(shared_params)s
bold_rows : boolean, default True
Make the row labels bold in the output
classes : str or list or tuple, default None
CSS class(es) to apply to the resulting html table
escape : boolean, default True
Convert the characters <, >, and & to HTML-safe sequences.
notebook : {True, False}, default False
Whether the generated HTML is for IPython Notebook.
decimal : string, default '.'
Character recognized as decimal separator, e.g. ',' in Europe
.. versionadded:: 0.18.0
border : int
A ``border=border`` attribute is included in the opening
`<table>` tag. Default ``pd.options.html.border``.
.. versionadded:: 0.19.0
table_id : str, optional
A css id is included in the opening `<table>` tag if specified.
.. versionadded:: 0.23.0
%(returns)s
See Also
--------
to_string : Convert DataFrame to a string.
"""
if (justify is not None and
justify not in fmt._VALID_JUSTIFY_PARAMETERS):
raise ValueError("Invalid value for justify parameter")
formatter = fmt.DataFrameFormatter(self, buf=buf, columns=columns,
col_space=col_space, na_rep=na_rep,
formatters=formatters,
float_format=float_format,
sparsify=sparsify, justify=justify,
index_names=index_names,
header=header, index=index,
bold_rows=bold_rows, escape=escape,
max_rows=max_rows,
max_cols=max_cols,
show_dimensions=show_dimensions,
decimal=decimal, table_id=table_id)
# TODO: a generic formatter wld b in DataFrameFormatter
formatter.to_html(classes=classes, notebook=notebook, border=border)
if buf is None:
return formatter.buf.getvalue()
def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None,
null_counts=None):
"""
Print a concise summary of a DataFrame.
This method prints information about a DataFrame including
the index dtype and column dtypes, non-null values and memory usage.
Parameters
----------
verbose : bool, optional
Whether to print the full summary. By default, the setting in
``pandas.options.display.max_info_columns`` is followed.
buf : writable buffer, defaults to sys.stdout
Where to send the output. By default, the output is printed to
sys.stdout. Pass a writable buffer if you need to further process
the output.
max_cols : int, optional
When to switch from the verbose to the truncated output. If the
DataFrame has more than `max_cols` columns, the truncated output
is used. By default, the setting in
``pandas.options.display.max_info_columns`` is used.
memory_usage : bool, str, optional
Specifies whether total memory usage of the DataFrame
elements (including the index) should be displayed. By default,
this follows the ``pandas.options.display.memory_usage`` setting.
True always show memory usage. False never shows memory usage.
A value of 'deep' is equivalent to "True with deep introspection".
Memory usage is shown in human-readable units (base-2
representation). Without deep introspection a memory estimation is
made based in column dtype and number of rows assuming values
consume the same memory amount for corresponding dtypes. With deep
memory introspection, a real memory usage calculation is performed
at the cost of computational resources.
null_counts : bool, optional
Whether to show the non-null counts. By default, this is shown
only if the frame is smaller than
``pandas.options.display.max_info_rows`` and
``pandas.options.display.max_info_columns``. A value of True always
shows the counts, and False never shows the counts.
Returns
-------
None
This method prints a summary of a DataFrame and returns None.
See Also
--------
DataFrame.describe: Generate descriptive statistics of DataFrame
columns.
DataFrame.memory_usage: Memory usage of DataFrame columns.
Examples
--------
>>> int_values = [1, 2, 3, 4, 5]
>>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
>>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0]
>>> df = pd.DataFrame({"int_col": int_values, "text_col": text_values,
... "float_col": float_values})
>>> df
int_col text_col float_col
0 1 alpha 0.00
1 2 beta 0.25
2 3 gamma 0.50
3 4 delta 0.75
4 5 epsilon 1.00
Prints information of all columns:
>>> df.info(verbose=True)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
int_col 5 non-null int64
text_col 5 non-null object
float_col 5 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 200.0+ bytes
Prints a summary of columns count and its dtypes but not per column
information:
>>> df.info(verbose=False)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Columns: 3 entries, int_col to float_col
dtypes: float64(1), int64(1), object(1)
memory usage: 200.0+ bytes
Pipe output of DataFrame.info to buffer instead of sys.stdout, get
buffer content and writes to a text file:
>>> import io
>>> buffer = io.StringIO()
>>> df.info(buf=buffer)
>>> s = buffer.getvalue()
>>> with open("df_info.txt", "w", encoding="utf-8") as f:
... f.write(s)
260
The `memory_usage` parameter allows deep introspection mode, specially
useful for big DataFrames and fine-tune memory optimization:
>>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6)
>>> df = pd.DataFrame({
... 'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6),
... 'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6),
... 'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6)
... })
>>> df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 3 columns):
column_1 1000000 non-null object
column_2 1000000 non-null object
column_3 1000000 non-null object
dtypes: object(3)
memory usage: 22.9+ MB
>>> df.info(memory_usage='deep')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 3 columns):
column_1 1000000 non-null object
column_2 1000000 non-null object
column_3 1000000 non-null object
dtypes: object(3)
memory usage: 188.8 MB
"""
if buf is None: # pragma: no cover
buf = sys.stdout
lines = []
lines.append(str(type(self)))
lines.append(self.index._summary())
if len(self.columns) == 0:
lines.append('Empty {name}'.format(name=type(self).__name__))
fmt.buffer_put_lines(buf, lines)
return
cols = self.columns
# hack
if max_cols is None:
max_cols = get_option('display.max_info_columns',
len(self.columns) + 1)
max_rows = get_option('display.max_info_rows', len(self) + 1)
if null_counts is None:
show_counts = ((len(self.columns) <= max_cols) and
(len(self) < max_rows))
else:
show_counts = null_counts
exceeds_info_cols = len(self.columns) > max_cols
def _verbose_repr():
lines.append('Data columns (total %d columns):' %
len(self.columns))
space = max(len(pprint_thing(k)) for k in self.columns) + 4
counts = None
tmpl = "{count}{dtype}"
if show_counts:
counts = self.count()
if len(cols) != len(counts): # pragma: no cover
raise AssertionError(
'Columns must equal counts '
'({cols:d} != {counts:d})'.format(
cols=len(cols), counts=len(counts)))
tmpl = "{count} non-null {dtype}"
dtypes = self.dtypes
for i, col in enumerate(self.columns):
dtype = dtypes.iloc[i]
col = pprint_thing(col)
count = ""
if show_counts:
count = counts.iloc[i]
lines.append(_put_str(col, space) + tmpl.format(count=count,
dtype=dtype))
def _non_verbose_repr():
lines.append(self.columns._summary(name='Columns'))
def _sizeof_fmt(num, size_qualifier):
# returns size in human readable format
for x in ['bytes', 'KB', 'MB', 'GB', 'TB']:
if num < 1024.0:
return ("{num:3.1f}{size_q} "
"{x}".format(num=num, size_q=size_qualifier, x=x))
num /= 1024.0
return "{num:3.1f}{size_q} {pb}".format(num=num,
size_q=size_qualifier,
pb='PB')
if verbose:
_verbose_repr()
elif verbose is False: # specifically set to False, not nesc None
_non_verbose_repr()
else:
if exceeds_info_cols:
_non_verbose_repr()
else:
_verbose_repr()
counts = self.get_dtype_counts()
dtypes = ['{k}({kk:d})'.format(k=k[0], kk=k[1]) for k
in sorted(compat.iteritems(counts))]
lines.append('dtypes: {types}'.format(types=', '.join(dtypes)))
if memory_usage is None:
memory_usage = get_option('display.memory_usage')
if memory_usage:
# append memory usage of df to display
size_qualifier = ''
if memory_usage == 'deep':
deep = True
else:
# size_qualifier is just a best effort; not guaranteed to catch
# all cases (e.g., it misses categorical data even with object
# categories)
deep = False
if ('object' in counts or
self.index._is_memory_usage_qualified()):
size_qualifier = '+'
mem_usage = self.memory_usage(index=True, deep=deep).sum()
lines.append("memory usage: {mem}\n".format(
mem=_sizeof_fmt(mem_usage, size_qualifier)))
fmt.buffer_put_lines(buf, lines)
def memory_usage(self, index=True, deep=False):
"""
Return the memory usage of each column in bytes.
The memory usage can optionally include the contribution of
the index and elements of `object` dtype.
This value is displayed in `DataFrame.info` by default. This can be
suppressed by setting ``pandas.options.display.memory_usage`` to False.
Parameters
----------
index : bool, default True
Specifies whether to include the memory usage of the DataFrame's
index in returned Series. If ``index=True`` the memory usage of the
index the first item in the output.
deep : bool, default False
If True, introspect the data deeply by interrogating
`object` dtypes for system-level memory consumption, and include
it in the returned values.
Returns
-------
sizes : Series
A Series whose index is the original column names and whose values
is the memory usage of each column in bytes.
See Also
--------
numpy.ndarray.nbytes : Total bytes consumed by the elements of an
ndarray.
Series.memory_usage : Bytes consumed by a Series.
pandas.Categorical : Memory-efficient array for string values with
many repeated values.
DataFrame.info : Concise summary of a DataFrame.
Examples
--------
>>> dtypes = ['int64', 'float64', 'complex128', 'object', 'bool']
>>> data = dict([(t, np.ones(shape=5000).astype(t))
... for t in dtypes])
>>> df = pd.DataFrame(data)
>>> df.head()
int64 float64 complex128 object bool
0 1 1.0 (1+0j) 1 True
1 1 1.0 (1+0j) 1 True
2 1 1.0 (1+0j) 1 True
3 1 1.0 (1+0j) 1 True
4 1 1.0 (1+0j) 1 True
>>> df.memory_usage()
Index 80
int64 40000
float64 40000
complex128 80000
object 40000
bool 5000
dtype: int64
>>> df.memory_usage(index=False)
int64 40000
float64 40000
complex128 80000
object 40000
bool 5000
dtype: int64
The memory footprint of `object` dtype columns is ignored by default:
>>> df.memory_usage(deep=True)
Index 80
int64 40000
float64 40000
complex128 80000
object 160000
bool 5000
dtype: int64
Use a Categorical for efficient storage of an object-dtype column with
many repeated values.
>>> df['object'].astype('category').memory_usage(deep=True)
5168
"""
result = Series([c.memory_usage(index=False, deep=deep)
for col, c in self.iteritems()], index=self.columns)
if index:
result = Series(self.index.memory_usage(deep=deep),
index=['Index']).append(result)
return result
def transpose(self, *args, **kwargs):
"""
Transpose index and columns.
Reflect the DataFrame over its main diagonal by writing rows as columns
and vice-versa. The property :attr:`.T` is an accessor to the method
:meth:`transpose`.
Parameters
----------
copy : bool, default False
If True, the underlying data is copied. Otherwise (default), no
copy is made if possible.
*args, **kwargs
Additional keywords have no effect but might be accepted for
compatibility with numpy.
Returns
-------
DataFrame
The transposed DataFrame.
See Also
--------
numpy.transpose : Permute the dimensions of a given array.
Notes
-----
Transposing a DataFrame with mixed dtypes will result in a homogeneous
DataFrame with the `object` dtype. In such a case, a copy of the data
is always made.
Examples
--------
**Square DataFrame with homogeneous dtype**
>>> d1 = {'col1': [1, 2], 'col2': [3, 4]}
>>> df1 = pd.DataFrame(data=d1)
>>> df1
col1 col2
0 1 3
1 2 4
>>> df1_transposed = df1.T # or df1.transpose()
>>> df1_transposed
0 1
col1 1 2
col2 3 4
When the dtype is homogeneous in the original DataFrame, we get a
transposed DataFrame with the same dtype:
>>> df1.dtypes
col1 int64
col2 int64
dtype: object
>>> df1_transposed.dtypes
0 int64
1 int64
dtype: object
**Non-square DataFrame with mixed dtypes**
>>> d2 = {'name': ['Alice', 'Bob'],
... 'score': [9.5, 8],
... 'employed': [False, True],
... 'kids': [0, 0]}
>>> df2 = pd.DataFrame(data=d2)
>>> df2
name score employed kids
0 Alice 9.5 False 0
1 Bob 8.0 True 0
>>> df2_transposed = df2.T # or df2.transpose()
>>> df2_transposed
0 1
name Alice Bob
score 9.5 8
employed False True
kids 0 0
When the DataFrame has mixed dtypes, we get a transposed DataFrame with
the `object` dtype:
>>> df2.dtypes
name object
score float64
employed bool
kids int64
dtype: object
>>> df2_transposed.dtypes
0 object
1 object
dtype: object
"""
nv.validate_transpose(args, dict())
return super(DataFrame, self).transpose(1, 0, **kwargs)
T = property(transpose)
# ----------------------------------------------------------------------
# Picklability
# legacy pickle formats
def _unpickle_frame_compat(self, state): # pragma: no cover
if len(state) == 2: # pragma: no cover
series, idx = state
columns = sorted(series)
else:
series, cols, idx = state
columns = com._unpickle_array(cols)
index = com._unpickle_array(idx)
self._data = self._init_dict(series, index, columns, None)
def _unpickle_matrix_compat(self, state): # pragma: no cover
# old unpickling
(vals, idx, cols), object_state = state
index = com._unpickle_array(idx)
dm = DataFrame(vals, index=index, columns=com._unpickle_array(cols),
copy=False)
if object_state is not None:
ovals, _, ocols = object_state
objects = DataFrame(ovals, index=index,
columns=com._unpickle_array(ocols), copy=False)
dm = dm.join(objects)
self._data = dm._data
# ----------------------------------------------------------------------
# Getting and setting elements
def get_value(self, index, col, takeable=False):
"""Quickly retrieve single value at passed column and index
.. deprecated:: 0.21.0
Use .at[] or .iat[] accessors instead.
Parameters
----------
index : row label
col : column label
takeable : interpret the index/col as indexers, default False
Returns
-------
value : scalar value
"""
warnings.warn("get_value is deprecated and will be removed "
"in a future release. Please use "
".at[] or .iat[] accessors instead", FutureWarning,
stacklevel=2)
return self._get_value(index, col, takeable=takeable)
def _get_value(self, index, col, takeable=False):
if takeable:
series = self._iget_item_cache(col)
return com.maybe_box_datetimelike(series._values[index])
series = self._get_item_cache(col)
engine = self.index._engine
try:
return engine.get_value(series._values, index)
except (TypeError, ValueError):
# we cannot handle direct indexing
# use positional
col = self.columns.get_loc(col)
index = self.index.get_loc(index)
return self._get_value(index, col, takeable=True)
_get_value.__doc__ = get_value.__doc__
def set_value(self, index, col, value, takeable=False):
"""Put single value at passed column and index
.. deprecated:: 0.21.0
Use .at[] or .iat[] accessors instead.
Parameters
----------
index : row label
col : column label
value : scalar value
takeable : interpret the index/col as indexers, default False
Returns
-------
frame : DataFrame
If label pair is contained, will be reference to calling DataFrame,
otherwise a new object
"""
warnings.warn("set_value is deprecated and will be removed "
"in a future release. Please use "
".at[] or .iat[] accessors instead", FutureWarning,
stacklevel=2)
return self._set_value(index, col, value, takeable=takeable)
def _set_value(self, index, col, value, takeable=False):
try:
if takeable is True:
series = self._iget_item_cache(col)
return series._set_value(index, value, takeable=True)
series = self._get_item_cache(col)
engine = self.index._engine
engine.set_value(series._values, index, value)
return self
except (KeyError, TypeError):
# set using a non-recursive method & reset the cache
self.loc[index, col] = value
self._item_cache.pop(col, None)
return self
_set_value.__doc__ = set_value.__doc__
def _ixs(self, i, axis=0):
"""
i : int, slice, or sequence of integers
axis : int
"""
# irow
if axis == 0:
"""
Notes
-----
If slice passed, the resulting data will be a view
"""
if isinstance(i, slice):
return self[i]
else:
label = self.index[i]
if isinstance(label, Index):
# a location index by definition
result = self.take(i, axis=axis)
copy = True
else:
new_values = self._data.fast_xs(i)
if is_scalar(new_values):
return new_values
# if we are a copy, mark as such
copy = (isinstance(new_values, np.ndarray) and
new_values.base is None)
result = self._constructor_sliced(new_values,
index=self.columns,
name=self.index[i],
dtype=new_values.dtype)
result._set_is_copy(self, copy=copy)
return result
# icol
else:
"""
Notes
-----
If slice passed, the resulting data will be a view
"""
label = self.columns[i]
if isinstance(i, slice):
# need to return view
lab_slice = slice(label[0], label[-1])
return self.loc[:, lab_slice]
else:
if isinstance(label, Index):
return self._take(i, axis=1)
index_len = len(self.index)
# if the values returned are not the same length
# as the index (iow a not found value), iget returns
# a 0-len ndarray. This is effectively catching
# a numpy error (as numpy should really raise)
values = self._data.iget(i)
if index_len and not len(values):
values = np.array([np.nan] * index_len, dtype=object)
result = self._box_col_values(values, label)
# this is a cached value, mark it so
result._set_as_cached(label, self)
return result
def __getitem__(self, key):
key = com.apply_if_callable(key, self)
# shortcut if the key is in columns
try:
if self.columns.is_unique and key in self.columns:
if self.columns.nlevels > 1:
return self._getitem_multilevel(key)
return self._get_item_cache(key)
except (TypeError, ValueError):
# The TypeError correctly catches non hashable "key" (e.g. list)
# The ValueError can be removed once GH #21729 is fixed
pass
# Do we have a slicer (on rows)?
indexer = convert_to_index_sliceable(self, key)
if indexer is not None:
return self._slice(indexer, axis=0)
# Do we have a (boolean) DataFrame?
if isinstance(key, DataFrame):
return self._getitem_frame(key)
# Do we have a (boolean) 1d indexer?
if com.is_bool_indexer(key):
return self._getitem_bool_array(key)
# We are left with two options: a single key, and a collection of keys,
# We interpret tuples as collections only for non-MultiIndex
is_single_key = isinstance(key, tuple) or not is_list_like(key)
if is_single_key:
if self.columns.nlevels > 1:
return self._getitem_multilevel(key)
indexer = self.columns.get_loc(key)
if is_integer(indexer):
indexer = [indexer]
else:
if is_iterator(key):
key = list(key)
indexer = self.loc._convert_to_indexer(key, axis=1,
raise_missing=True)
# take() does not accept boolean indexers
if getattr(indexer, "dtype", None) == bool:
indexer = np.where(indexer)[0]
data = self._take(indexer, axis=1)
if is_single_key:
# What does looking for a single key in a non-unique index return?
# The behavior is inconsistent. It returns a Series, except when
# - the key itself is repeated (test on data.shape, #9519), or
# - we have a MultiIndex on columns (test on self.columns, #21309)
if data.shape[1] == 1 and not isinstance(self.columns, MultiIndex):
data = data[key]
return data
def _getitem_bool_array(self, key):
# also raises Exception if object array with NA values
# warning here just in case -- previously __setitem__ was
# reindexing but __getitem__ was not; it seems more reasonable to
# go with the __setitem__ behavior since that is more consistent
# with all other indexing behavior
if isinstance(key, Series) and not key.index.equals(self.index):
warnings.warn("Boolean Series key will be reindexed to match "
"DataFrame index.", UserWarning, stacklevel=3)
elif len(key) != len(self.index):
raise ValueError('Item wrong length %d instead of %d.' %
(len(key), len(self.index)))
# check_bool_indexer will throw exception if Series key cannot
# be reindexed to match DataFrame rows
key = check_bool_indexer(self.index, key)
indexer = key.nonzero()[0]
return self._take(indexer, axis=0)
def _getitem_multilevel(self, key):
loc = self.columns.get_loc(key)
if isinstance(loc, (slice, Series, np.ndarray, Index)):
new_columns = self.columns[loc]
result_columns = maybe_droplevels(new_columns, key)
if self._is_mixed_type:
result = self.reindex(columns=new_columns)
result.columns = result_columns
else:
new_values = self.values[:, loc]
result = self._constructor(new_values, index=self.index,
columns=result_columns)
result = result.__finalize__(self)
# If there is only one column being returned, and its name is
# either an empty string, or a tuple with an empty string as its
# first element, then treat the empty string as a placeholder
# and return the column as if the user had provided that empty
# string in the key. If the result is a Series, exclude the
# implied empty string from its name.
if len(result.columns) == 1:
top = result.columns[0]
if isinstance(top, tuple):
top = top[0]
if top == '':
result = result['']
if isinstance(result, Series):
result = self._constructor_sliced(result,
index=self.index,
name=key)
result._set_is_copy(self)
return result
else:
return self._get_item_cache(key)
def _getitem_frame(self, key):
if key.values.size and not is_bool_dtype(key.values):
raise ValueError('Must pass DataFrame with boolean values only')
return self.where(key)
def query(self, expr, inplace=False, **kwargs):
"""Query the columns of a frame with a boolean expression.
Parameters
----------
expr : string
The query string to evaluate. You can refer to variables
in the environment by prefixing them with an '@' character like
``@a + b``.
inplace : bool
Whether the query should modify the data in place or return
a modified copy
.. versionadded:: 0.18.0
kwargs : dict
See the documentation for :func:`pandas.eval` for complete details
on the keyword arguments accepted by :meth:`DataFrame.query`.
Returns
-------
q : DataFrame
Notes
-----
The result of the evaluation of this expression is first passed to
:attr:`DataFrame.loc` and if that fails because of a
multidimensional key (e.g., a DataFrame) then the result will be passed
to :meth:`DataFrame.__getitem__`.
This method uses the top-level :func:`pandas.eval` function to
evaluate the passed query.
The :meth:`~pandas.DataFrame.query` method uses a slightly
modified Python syntax by default. For example, the ``&`` and ``|``
(bitwise) operators have the precedence of their boolean cousins,
:keyword:`and` and :keyword:`or`. This *is* syntactically valid Python,
however the semantics are different.
You can change the semantics of the expression by passing the keyword
argument ``parser='python'``. This enforces the same semantics as
evaluation in Python space. Likewise, you can pass ``engine='python'``
to evaluate an expression using Python itself as a backend. This is not
recommended as it is inefficient compared to using ``numexpr`` as the
engine.
The :attr:`DataFrame.index` and
:attr:`DataFrame.columns` attributes of the
:class:`~pandas.DataFrame` instance are placed in the query namespace
by default, which allows you to treat both the index and columns of the
frame as a column in the frame.
The identifier ``index`` is used for the frame index; you can also
use the name of the index to identify it in a query. Please note that
Python keywords may not be used as identifiers.
For further details and examples see the ``query`` documentation in
:ref:`indexing <indexing.query>`.
See Also
--------
pandas.eval
DataFrame.eval
Examples
--------
>>> df = pd.DataFrame(np.random.randn(10, 2), columns=list('ab'))
>>> df.query('a > b')
>>> df[df.a > df.b] # same result as the previous expression
"""
inplace = validate_bool_kwarg(inplace, 'inplace')
if not isinstance(expr, compat.string_types):
msg = "expr must be a string to be evaluated, {0} given"
raise ValueError(msg.format(type(expr)))
kwargs['level'] = kwargs.pop('level', 0) + 1
kwargs['target'] = None
res = self.eval(expr, **kwargs)
try:
new_data = self.loc[res]
except ValueError:
# when res is multi-dimensional loc raises, but this is sometimes a
# valid query
new_data = self[res]
if inplace:
self._update_inplace(new_data)
else:
return new_data
def eval(self, expr, inplace=False, **kwargs):
"""
Evaluate a string describing operations on DataFrame columns.
Operates on columns only, not specific rows or elements. This allows
`eval` to run arbitrary code, which can make you vulnerable to code
injection if you pass user input to this function.
Parameters
----------
expr : str
The expression string to evaluate.
inplace : bool, default False
If the expression contains an assignment, whether to perform the
operation inplace and mutate the existing DataFrame. Otherwise,
a new DataFrame is returned.
.. versionadded:: 0.18.0.
kwargs : dict
See the documentation for :func:`~pandas.eval` for complete details
on the keyword arguments accepted by
:meth:`~pandas.DataFrame.query`.
Returns
-------
ndarray, scalar, or pandas object
The result of the evaluation.
See Also
--------
DataFrame.query : Evaluates a boolean expression to query the columns
of a frame.
DataFrame.assign : Can evaluate an expression or function to create new
values for a column.
pandas.eval : Evaluate a Python expression as a string using various
backends.
Notes
-----
For more details see the API documentation for :func:`~pandas.eval`.
For detailed examples see :ref:`enhancing performance with eval
<enhancingperf.eval>`.
Examples
--------
>>> df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)})
>>> df
A B
0 1 10
1 2 8
2 3 6
3 4 4
4 5 2
>>> df.eval('A + B')
0 11
1 10
2 9
3 8
4 7
dtype: int64
Assignment is allowed though by default the original DataFrame is not
modified.
>>> df.eval('C = A + B')
A B C
0 1 10 11
1 2 8 10
2 3 6 9
3 4 4 8
4 5 2 7
>>> df
A B
0 1 10
1 2 8
2 3 6
3 4 4
4 5 2
Use ``inplace=True`` to modify the original DataFrame.
>>> df.eval('C = A + B', inplace=True)
>>> df
A B C
0 1 10 11
1 2 8 10
2 3 6 9
3 4 4 8
4 5 2 7
"""
from pandas.core.computation.eval import eval as _eval
inplace = validate_bool_kwarg(inplace, 'inplace')
resolvers = kwargs.pop('resolvers', None)
kwargs['level'] = kwargs.pop('level', 0) + 1
if resolvers is None:
index_resolvers = self._get_index_resolvers()
resolvers = dict(self.iteritems()), index_resolvers
if 'target' not in kwargs:
kwargs['target'] = self
kwargs['resolvers'] = kwargs.get('resolvers', ()) + tuple(resolvers)
return _eval(expr, inplace=inplace, **kwargs)
def select_dtypes(self, include=None, exclude=None):
"""
Return a subset of the DataFrame's columns based on the column dtypes.
Parameters
----------
include, exclude : scalar or list-like
A selection of dtypes or strings to be included/excluded. At least
one of these parameters must be supplied.
Raises
------
ValueError
* If both of ``include`` and ``exclude`` are empty
* If ``include`` and ``exclude`` have overlapping elements
* If any kind of string dtype is passed in.
Returns
-------
subset : DataFrame
The subset of the frame including the dtypes in ``include`` and
excluding the dtypes in ``exclude``.
Notes
-----
* To select all *numeric* types, use ``np.number`` or ``'number'``
* To select strings you must use the ``object`` dtype, but note that
this will return *all* object dtype columns
* See the `numpy dtype hierarchy
<http://docs.scipy.org/doc/numpy/reference/arrays.scalars.html>`__
* To select datetimes, use ``np.datetime64``, ``'datetime'`` or
``'datetime64'``
* To select timedeltas, use ``np.timedelta64``, ``'timedelta'`` or
``'timedelta64'``
* To select Pandas categorical dtypes, use ``'category'``
* To select Pandas datetimetz dtypes, use ``'datetimetz'`` (new in
0.20.0) or ``'datetime64[ns, tz]'``
Examples
--------
>>> df = pd.DataFrame({'a': [1, 2] * 3,
... 'b': [True, False] * 3,
... 'c': [1.0, 2.0] * 3})
>>> df
a b c
0 1 True 1.0
1 2 False 2.0
2 1 True 1.0
3 2 False 2.0
4 1 True 1.0
5 2 False 2.0
>>> df.select_dtypes(include='bool')
b
0 True
1 False
2 True
3 False
4 True
5 False
>>> df.select_dtypes(include=['float64'])
c
0 1.0
1 2.0
2 1.0
3 2.0
4 1.0
5 2.0
>>> df.select_dtypes(exclude=['int'])
b c
0 True 1.0
1 False 2.0
2 True 1.0
3 False 2.0
4 True 1.0
5 False 2.0
"""
if not is_list_like(include):
include = (include,) if include is not None else ()
if not is_list_like(exclude):
exclude = (exclude,) if exclude is not None else ()
selection = tuple(map(frozenset, (include, exclude)))
if not any(selection):
raise ValueError('at least one of include or exclude must be '
'nonempty')
# convert the myriad valid dtypes object to a single representation
include, exclude = map(
lambda x: frozenset(map(_get_dtype_from_object, x)), selection)
for dtypes in (include, exclude):
invalidate_string_dtypes(dtypes)
# can't both include AND exclude!
if not include.isdisjoint(exclude):
raise ValueError('include and exclude overlap on {inc_ex}'.format(
inc_ex=(include & exclude)))
# empty include/exclude -> defaults to True
# three cases (we've already raised if both are empty)
# case 1: empty include, nonempty exclude
# we have True, True, ... True for include, same for exclude
# in the loop below we get the excluded
# and when we call '&' below we get only the excluded
# case 2: nonempty include, empty exclude
# same as case 1, but with include
# case 3: both nonempty
# the "union" of the logic of case 1 and case 2:
# we get the included and excluded, and return their logical and
include_these = Series(not bool(include), index=self.columns)
exclude_these = Series(not bool(exclude), index=self.columns)
def is_dtype_instance_mapper(idx, dtype):
return idx, functools.partial(issubclass, dtype.type)
for idx, f in itertools.starmap(is_dtype_instance_mapper,
enumerate(self.dtypes)):
if include: # checks for the case of empty include or exclude
include_these.iloc[idx] = any(map(f, include))
if exclude:
exclude_these.iloc[idx] = not any(map(f, exclude))
dtype_indexer = include_these & exclude_these
return self.loc[com.get_info_slice(self, dtype_indexer)]
def _box_item_values(self, key, values):
items = self.columns[self.columns.get_loc(key)]
if values.ndim == 2:
return self._constructor(values.T, columns=items, index=self.index)
else:
return self._box_col_values(values, items)
def _box_col_values(self, values, items):
""" provide boxed values for a column """
klass = _get_sliced_frame_result_type(values, self)
return klass(values, index=self.index, name=items, fastpath=True)
def __setitem__(self, key, value):
key = com.apply_if_callable(key, self)
# see if we can slice the rows
indexer = convert_to_index_sliceable(self, key)
if indexer is not None:
return self._setitem_slice(indexer, value)
if isinstance(key, DataFrame) or getattr(key, 'ndim', None) == 2:
self._setitem_frame(key, value)
elif isinstance(key, (Series, np.ndarray, list, Index)):
self._setitem_array(key, value)
else:
# set column
self._set_item(key, value)
def _setitem_slice(self, key, value):
self._check_setitem_copy()
self.loc._setitem_with_indexer(key, value)
def _setitem_array(self, key, value):
# also raises Exception if object array with NA values
if com.is_bool_indexer(key):