diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst index 61efc6a707d31..a0c6f1332f339 100644 --- a/doc/source/contributing.rst +++ b/doc/source/contributing.rst @@ -262,8 +262,9 @@ after updating. Contributing to the documentation ================================= -If you're not the developer type, contributing to the documentation is still of -huge value. You don't even have to be an expert on *pandas* to do so! In fact, +Contributing to the documentation benefits everyone who uses *pandas*. +We encourage you to help us improve the documentation, and +you don't have to be an expert on *pandas* to do so! In fact, there are sections of the docs that are worse off after being written by experts. If something in the docs doesn't make sense to you, updating the relevant section after you figure it out is a great way to ensure it will help diff --git a/pandas/core/base.py b/pandas/core/base.py index 280b8849792e3..fd039480fc6f1 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1184,24 +1184,6 @@ def searchsorted(self, value, side='left', sorter=None): # needs coercion on the key (DatetimeIndex does already) return self.values.searchsorted(value, side=side, sorter=sorter) - _shared_docs['drop_duplicates'] = ( - """Return %(klass)s with duplicate values removed - - Parameters - ---------- - - keep : {'first', 'last', False}, default 'first' - - ``first`` : Drop duplicates except for the first occurrence. - - ``last`` : Drop duplicates except for the last occurrence. - - False : Drop all duplicates. 
- %(inplace)s - - Returns - ------- - deduplicated : %(klass)s - """) - - @Appender(_shared_docs['drop_duplicates'] % _indexops_doc_kwargs) def drop_duplicates(self, keep='first', inplace=False): inplace = validate_bool_kwarg(inplace, 'inplace') if isinstance(self, ABCIndexClass): diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a66d00fff9714..d430d442fae0f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -883,27 +883,66 @@ def dot(self, other): @classmethod def from_dict(cls, data, orient='columns', dtype=None, columns=None): """ - Construct DataFrame from dict of array-like or dicts + Construct DataFrame from dict of array-like or dicts. + + Creates DataFrame object from dictionary by columns or by index + allowing dtype specification. Parameters ---------- data : dict - {field : array-like} or {field : dict} + Of the form {field : array-like} or {field : dict}. orient : {'columns', 'index'}, default 'columns' The "orientation" of the data. If the keys of the passed dict should be the columns of the resulting DataFrame, pass 'columns' (default). Otherwise if the keys should be rows, pass 'index'. dtype : dtype, default None - Data type to force, otherwise infer - columns: list, default None - Column labels to use when orient='index'. Raises a ValueError - if used with orient='columns' + Data type to force, otherwise infer. + columns : list, default None + Column labels to use when ``orient='index'``. Raises a ValueError + if used with ``orient='columns'``. .. 
versionadded:: 0.23.0 Returns ------- - DataFrame + pandas.DataFrame + + See Also + -------- + DataFrame.from_records : DataFrame from ndarray (structured + dtype), list of tuples, dict, or DataFrame + DataFrame : DataFrame object creation using constructor + + Examples + -------- + By default the keys of the dict become the DataFrame columns: + + >>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']} + >>> pd.DataFrame.from_dict(data) + col_1 col_2 + 0 3 a + 1 2 b + 2 1 c + 3 0 d + + Specify ``orient='index'`` to create the DataFrame using dictionary + keys as rows: + + >>> data = {'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd']} + >>> pd.DataFrame.from_dict(data, orient='index') + 0 1 2 3 + row_1 3 2 1 0 + row_2 a b c d + + When using the 'index' orientation, the column names can be + specified manually: + + >>> pd.DataFrame.from_dict(data, orient='index', + ... columns=['A', 'B', 'C', 'D']) + A B C D + row_1 3 2 1 0 + row_2 a b c d """ index = None orient = orient.lower() @@ -1209,20 +1248,68 @@ def from_records(cls, data, index=None, exclude=None, columns=None, def to_records(self, index=True, convert_datetime64=True): """ - Convert DataFrame to record array. Index will be put in the - 'index' field of the record array if requested + Convert DataFrame to a NumPy record array. + + Index will be put in the 'index' field of the record array if + requested. Parameters ---------- index : boolean, default True - Include index in resulting record array, stored in 'index' field + Include index in resulting record array, stored in 'index' field. convert_datetime64 : boolean, default True Whether to convert the index to datetime.datetime if it is a - DatetimeIndex + DatetimeIndex. Returns ------- - y : recarray + y : numpy.recarray + + See Also + -------- + DataFrame.from_records: convert structured or record ndarray + to DataFrame. + numpy.recarray: ndarray that allows field access using + attributes, analogous to typed columns in a + spreadsheet. 
+ + Examples + -------- + >>> df = pd.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]}, + ... index=['a', 'b']) + >>> df + A B + a 1 0.50 + b 2 0.75 + >>> df.to_records() + rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)], + dtype=[('index', 'O'), ('A', '>> df.to_records(index=False) + rec.array([(1, 0.5 ), (2, 0.75)], + dtype=[('A', '>> df.index = pd.date_range('2018-01-01 09:00', periods=2, freq='min') + >>> df + A B + 2018-01-01 09:00:00 1 0.50 + 2018-01-01 09:01:00 2 0.75 + >>> df.to_records() + rec.array([(datetime.datetime(2018, 1, 1, 9, 0), 1, 0.5 ), + (datetime.datetime(2018, 1, 1, 9, 1), 2, 0.75)], + dtype=[('index', 'O'), ('A', '>> df.to_records(convert_datetime64=False) + rec.array([('2018-01-01T09:00:00.000000000', 1, 0.5 ), + ('2018-01-01T09:01:00.000000000', 2, 0.75)], + dtype=[('index', '>> df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6], + ... 'b': [1, 1, 2, 3, 5, 8], + ... 'c': [1, 4, 9, 16, 25, 36]}) + >>> df + a b c + 0 1 1 1 + 1 2 1 4 + 2 3 2 9 + 3 4 3 16 + 4 5 5 25 + 5 6 8 36 + + >>> df.diff() + a b c + 0 NaN NaN NaN + 1 1.0 0.0 3.0 + 2 1.0 1.0 5.0 + 3 1.0 1.0 7.0 + 4 1.0 2.0 9.0 + 5 1.0 3.0 11.0 + + Difference with previous column + + >>> df.diff(axis=1) + a b c + 0 NaN 0.0 0.0 + 1 NaN -1.0 3.0 + 2 NaN -1.0 7.0 + 3 NaN -1.0 13.0 + 4 NaN 0.0 20.0 + 5 NaN 2.0 28.0 + + Difference with 3rd previous row + + >>> df.diff(periods=3) + a b c + 0 NaN NaN NaN + 1 NaN NaN NaN + 2 NaN NaN NaN + 3 3.0 2.0 15.0 + 4 3.0 4.0 21.0 + 5 3.0 6.0 27.0 + + Difference with following row + + >>> df.diff(periods=-1) + a b c + 0 -1.0 0.0 -3.0 + 1 -1.0 -1.0 -5.0 + 2 -1.0 -1.0 -7.0 + 3 -1.0 -2.0 -9.0 + 4 -1.0 -3.0 -11.0 + 5 NaN NaN NaN """ bm_axis = self._get_block_manager_axis(axis) new_data = self._data.diff(n=periods, axis=bm_axis) @@ -5501,7 +5658,22 @@ def corr(self, method='pearson', min_periods=1): def cov(self, min_periods=None): """ - Compute pairwise covariance of columns, excluding NA/null values + Compute pairwise covariance of columns, excluding NA/null values. 
+ + Compute the pairwise covariance among the series of a DataFrame. + The returned data frame is the `covariance matrix + `__ of the columns + of the DataFrame. + + Both NA and null values are automatically excluded from the + calculation. (See the note below about bias from missing values.) + A threshold can be set for the minimum number of + observations for each value created. Comparisons with observations + below this threshold will be returned as ``NaN``. + + This method is generally used for the analysis of time series data to + understand the relationship between different measures + across time. Parameters ---------- @@ -5511,12 +5683,71 @@ def cov(self, min_periods=None): Returns ------- - y : DataFrame + DataFrame + The covariance matrix of the series of the DataFrame. + + See Also + -------- + pandas.Series.cov : compute covariance with another Series + pandas.core.window.EWM.cov : exponential weighted sample covariance + pandas.core.window.Expanding.cov : expanding sample covariance + pandas.core.window.Rolling.cov : rolling sample covariance Notes ----- - `y` contains the covariance matrix of the DataFrame's time series. - The covariance is normalized by N-1 (unbiased estimator). + Returns the covariance matrix of the DataFrame's time series. + The covariance is normalized by N-1. + + For DataFrames that have Series that are missing data (assuming that + data is `missing at random + `__) + the returned covariance matrix will be an unbiased estimate + of the variance and covariance between the member Series. + + However, for many applications this estimate may not be acceptable + because the estimate covariance matrix is not guaranteed to be positive + semi-definite. This could lead to estimate correlations having + absolute values which are greater than one, and/or a non-invertible + covariance matrix. See `Estimation of covariance matrices + `__ for more details. + + Examples + -------- + >>> df = pd.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)], + ... 
columns=['dogs', 'cats']) + >>> df.cov() + dogs cats + dogs 0.666667 -1.000000 + cats -1.000000 1.666667 + + >>> np.random.seed(42) + >>> df = pd.DataFrame(np.random.randn(1000, 5), + ... columns=['a', 'b', 'c', 'd', 'e']) + >>> df.cov() + a b c d e + a 0.998438 -0.020161 0.059277 -0.008943 0.014144 + b -0.020161 1.059352 -0.008543 -0.024738 0.009826 + c 0.059277 -0.008543 1.010670 -0.001486 -0.000271 + d -0.008943 -0.024738 -0.001486 0.921297 -0.013692 + e 0.014144 0.009826 -0.000271 -0.013692 0.977795 + + **Minimum number of periods** + + This method also supports an optional ``min_periods`` keyword + that specifies the required minimum number of non-NA observations for + each column pair in order to have a valid result: + + >>> np.random.seed(42) + >>> df = pd.DataFrame(np.random.randn(20, 3), + ... columns=['a', 'b', 'c']) + >>> df.loc[df.index[:5], 'a'] = np.nan + >>> df.loc[df.index[5:10], 'b'] = np.nan + >>> df.cov(min_periods=12) + a b c + a 0.316741 NaN -0.150812 + b NaN 1.248003 0.191417 + c -0.150812 0.191417 0.895202 """ numeric_df = self._get_numeric_data() cols = numeric_df.columns diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c4af950f88ce7..26e316677febe 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3605,7 +3605,11 @@ def f(x): def head(self, n=5): """ - Return the first n rows. + Return the first `n` rows. + + This function returns the first `n` rows for the object based + on position. It is useful for quickly testing if your object + has the right type of data in it. Parameters ---------- @@ -3615,11 +3619,11 @@ def head(self, n=5): Returns ------- obj_head : type of caller - The first n rows of the caller object. + The first `n` rows of the caller object. See Also -------- - pandas.DataFrame.tail + pandas.DataFrame.tail: Returns the last `n` rows. 
Examples -------- @@ -3647,7 +3651,7 @@ def head(self, n=5): 3 lion 4 monkey - Viewing the first n lines (three in this case) + Viewing the first `n` lines (three in this case) >>> df.head(3) animal @@ -3660,7 +3664,11 @@ def head(self, n=5): def tail(self, n=5): """ - Return the last n rows. + Return the last `n` rows. + + This function returns last `n` rows from the object based on + position. It is useful for quickly verifying data, for example, + after sorting or appending rows. Parameters ---------- @@ -3669,12 +3677,12 @@ def tail(self, n=5): Returns ------- - obj_tail : type of caller - The last n rows of the caller object. + type of caller + The last `n` rows of the caller object. See Also -------- - pandas.DataFrame.head + pandas.DataFrame.head : The first `n` rows of the caller object. Examples -------- @@ -3702,7 +3710,7 @@ def tail(self, n=5): 7 whale 8 zebra - Viewing the last n lines (three in this case) + Viewing the last `n` lines (three in this case) >>> df.tail(3) animal @@ -7577,11 +7585,10 @@ def _add_numeric_operations(cls): cls.any = _make_logical_function( cls, 'any', name, name2, axis_descr, 'Return whether any element is True over requested axis', - nanops.nanany) + nanops.nanany, '', '') cls.all = _make_logical_function( - cls, 'all', name, name2, axis_descr, - 'Return whether all elements are True over requested axis', - nanops.nanall) + cls, 'all', name, name2, axis_descr, _all_doc, + nanops.nanall, _all_examples, _all_see_also) @Substitution(outname='mad', desc="Return the mean absolute deviation of the values " @@ -7838,7 +7845,6 @@ def _doc_parms(cls): %(outname)s : %(name1)s or %(name2)s (if level specified)\n""" _bool_doc = """ - %(desc)s Parameters @@ -7846,17 +7852,71 @@ def _doc_parms(cls): axis : %(axis_descr)s skipna : boolean, default True Exclude NA/null values. If an entire row/column is NA, the result - will be NA + will be NA. 
level : int or level name, default None If the axis is a MultiIndex (hierarchical), count along a - particular level, collapsing into a %(name1)s + particular level, collapsing into a %(name1)s. bool_only : boolean, default None Include only boolean columns. If None, will attempt to use everything, then use only boolean data. Not implemented for Series. +**kwargs : any, default None + Additional keywords have no effect but might be accepted for + compatibility with numpy. Returns ------- -%(outname)s : %(name1)s or %(name2)s (if level specified)\n""" +%(outname)s : %(name1)s or %(name2)s (if level specified) + +%(examples)s +%(see_also)s""" + +_all_doc = """\
+Return whether all elements are True over series or dataframe axis. + +Returns True if all elements within a series or along a dataframe +axis are non-zero, not-empty or not-False.""" + +_all_examples = """\
+Examples +-------- +Series + +>>> pd.Series([True, True]).all() +True +>>> pd.Series([True, False]).all() +False + +Dataframes + +Create a dataframe from a dictionary. + +>>> df = pd.DataFrame({'col1': [True, True], 'col2': [True, False]}) +>>> df + col1 col2 +0 True True +1 True False + +Default behaviour checks if column-wise values all return True. + +>>> df.all() +col1 True +col2 False +dtype: bool + +Adding axis=1 argument will check if row-wise values all return True. 
+ +>>> df.all(axis=1) +0 True +1 False +dtype: bool +""" + +_all_see_also = """\ +See also +-------- +pandas.Series.all : Return True if all elements are True +pandas.DataFrame.any : Return True if one (or more) elements are True +""" _cnum_doc = """ @@ -8039,9 +8099,10 @@ def cum_func(self, axis=None, skipna=True, *args, **kwargs): return set_function_name(cum_func, name, cls) -def _make_logical_function(cls, name, name1, name2, axis_descr, desc, f): +def _make_logical_function(cls, name, name1, name2, axis_descr, desc, f, + examples, see_also): @Substitution(outname=name, desc=desc, name1=name1, name2=name2, - axis_descr=axis_descr) + axis_descr=axis_descr, examples=examples, see_also=see_also) @Appender(_bool_doc) def logical_func(self, axis=None, bool_only=None, skipna=None, level=None, **kwargs): diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 52283e4e223b4..e82b641db98fd 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -681,7 +681,47 @@ def _values(self): return self.values def get_values(self): - """ return the underlying data as an ndarray """ + """ + Return `Index` data as an `numpy.ndarray`. + + Returns + ------- + numpy.ndarray + A one-dimensional numpy array of the `Index` values. + + See Also + -------- + Index.values : The attribute that get_values wraps. + + Examples + -------- + Getting the `Index` values of a `DataFrame`: + + >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], + ... index=['a', 'b', 'c'], columns=['A', 'B', 'C']) + >>> df + A B C + a 1 2 3 + b 4 5 6 + c 7 8 9 + >>> df.index.get_values() + array(['a', 'b', 'c'], dtype=object) + + Standalone `Index` values: + + >>> idx = pd.Index(['1', '2', '3']) + >>> idx.get_values() + array(['1', '2', '3'], dtype=object) + + `MultiIndex` arrays also have only one dimension: + + >>> midx = pd.MultiIndex.from_arrays([[1, 2, 3], ['a', 'b', 'c']], + ... 
names=('number', 'letter')) + >>> midx.get_values() + array([(1, 'a'), (2, 'b'), (3, 'c')], dtype=object) + >>> midx.get_values().ndim + 1 + """ return self.values @Appender(IndexOpsMixin.memory_usage.__doc__) @@ -1710,6 +1750,59 @@ def _invalid_indexer(self, form, key): kind=type(key))) def get_duplicates(self): + """ + Extract duplicated index elements. + + Returns a sorted list of index elements which appear more than once in + the index. + + Returns + ------- + array-like + List of duplicated indexes. + + See Also + -------- + Index.duplicated : Return boolean array denoting duplicates. + Index.drop_duplicates : Return Index with duplicates removed. + + Examples + -------- + + Works on different Index of types. + + >>> pd.Index([1, 2, 2, 3, 3, 3, 4]).get_duplicates() + [2, 3] + >>> pd.Index([1., 2., 2., 3., 3., 3., 4.]).get_duplicates() + [2.0, 3.0] + >>> pd.Index(['a', 'b', 'b', 'c', 'c', 'c', 'd']).get_duplicates() + ['b', 'c'] + >>> dates = pd.to_datetime(['2018-01-01', '2018-01-02', '2018-01-03', + ... '2018-01-03', '2018-01-04', '2018-01-04'], + ... format='%Y-%m-%d') + >>> pd.Index(dates).get_duplicates() + DatetimeIndex(['2018-01-03', '2018-01-04'], + dtype='datetime64[ns]', freq=None) + + Sorts duplicated elements even when indexes are unordered. + + >>> pd.Index([1, 2, 3, 2, 3, 4, 3]).get_duplicates() + [2, 3] + + Return empty array-like structure when all elements are unique. + + >>> pd.Index([1, 2, 3, 4]).get_duplicates() + [] + >>> dates = pd.to_datetime(['2018-01-01', '2018-01-02', '2018-01-03'], + ... format='%Y-%m-%d') + >>> pd.Index(dates).get_duplicates() + DatetimeIndex([], dtype='datetime64[ns]', freq=None) + + Notes + ----- + In case of datetime-like indexes, the function is overridden where the + result is converted to DatetimeIndex. 
+ """ from collections import defaultdict counter = defaultdict(lambda: 0) for k in self.values: @@ -2256,18 +2349,19 @@ def sortlevel(self, level=None, ascending=True, sort_remaining=None): return self.sort_values(return_indexer=True, ascending=ascending) def shift(self, periods=1, freq=None): - """Shift index by desired number of time frequency increments. + """ + Shift index by desired number of time frequency increments. This method is for shifting the values of datetime-like indexes by a specified time increment a given number of times. Parameters ---------- - periods : int + periods : int, default 1 Number of periods (or increments) to shift by, - can be positive or negative (default is 1). - freq : pandas.DateOffset, pandas.Timedelta or string - Frequency increment to shift by (default is None). + can be positive or negative. + freq : pandas.DateOffset, pandas.Timedelta or string, optional + Frequency increment to shift by. If None, the index is shifted by its own `freq` attribute. Offset aliases are valid strings, e.g., 'D', 'W', 'M' etc. @@ -2276,6 +2370,10 @@ def shift(self, periods=1, freq=None): pandas.Index shifted index + See Also + -------- + Series.shift : Shift values of Series. + Examples -------- Put the first 5 month starts of 2011 into an index. @@ -4017,8 +4115,52 @@ def unique(self, level=None): result = super(Index, self).unique() return self._shallow_copy(result) - @Appender(base._shared_docs['drop_duplicates'] % _index_doc_kwargs) def drop_duplicates(self, keep='first'): + """ + Return Index with duplicate values removed. + + Parameters + ---------- + keep : {'first', 'last', ``False``}, default 'first' + - 'first' : Drop duplicates except for the first occurrence. + - 'last' : Drop duplicates except for the last occurrence. + - ``False`` : Drop all duplicates. 
+ + Returns + ------- + deduplicated : Index + + See Also + -------- + Series.drop_duplicates : equivalent method on Series + DataFrame.drop_duplicates : equivalent method on DataFrame + Index.duplicated : related method on Index, indicating duplicate + Index values. + + Examples + -------- + Generate an pandas.Index with duplicate values. + + >>> idx = pd.Index(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo']) + + The `keep` parameter controls which duplicate values are removed. + The value 'first' keeps the first occurrence for each + set of duplicated entries. The default value of keep is 'first'. + + >>> idx.drop_duplicates(keep='first') + Index(['lama', 'cow', 'beetle', 'hippo'], dtype='object') + + The value 'last' keeps the last occurrence for each set of duplicated + entries. + + >>> idx.drop_duplicates(keep='last') + Index(['cow', 'beetle', 'lama', 'hippo'], dtype='object') + + The value ``False`` discards all sets of duplicated entries. + + >>> idx.drop_duplicates(keep=False) + Index(['cow', 'beetle', 'hippo'], dtype='object') + """ return super(Index, self).drop_duplicates(keep=keep) @Appender(base._shared_docs['duplicated'] % _index_doc_kwargs) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 491fefe8efee0..b9c4b59536d0c 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -2204,29 +2204,30 @@ def _generate_regular_range(start, end, periods, offset): def date_range(start=None, end=None, periods=None, freq='D', tz=None, normalize=False, name=None, closed=None, **kwargs): """ - Return a fixed frequency DatetimeIndex, with day (calendar) as the default - frequency + Return a fixed frequency DatetimeIndex. + + The default frequency is day (calendar). Parameters ---------- start : string or datetime-like, default None - Left bound for generating dates + Left bound for generating dates. 
end : string or datetime-like, default None - Right bound for generating dates + Right bound for generating dates. periods : integer, default None - Number of periods to generate + Number of periods to generate. freq : string or DateOffset, default 'D' (calendar daily) - Frequency strings can have multiples, e.g. '5H' + Frequency strings can have multiples, e.g. '5H'. tz : string, default None Time zone name for returning localized DatetimeIndex, for example - Asia/Hong_Kong + Asia/Hong_Kong. normalize : bool, default False - Normalize start/end dates to midnight before generating date range + Normalize start/end dates to midnight before generating date range. name : string, default None - Name of the resulting DatetimeIndex + Name of the resulting DatetimeIndex. closed : string, default None Make the interval closed with respect to the given frequency to - the 'left', 'right', or both sides (None) + the 'left', 'right', or both sides (None). Notes ----- @@ -2239,6 +2240,22 @@ def date_range(start=None, end=None, periods=None, freq='D', tz=None, Returns ------- rng : DatetimeIndex + + See Also + -------- + pandas.period_range : Return a fixed frequency PeriodIndex. + pandas.interval_range : Return a fixed frequency IntervalIndex. + + Examples + -------- + >>> pd.date_range('2018-10-03', periods=2) # doctest: +NORMALIZE_WHITESPACE + DatetimeIndex(['2018-10-03', '2018-10-04'], dtype='datetime64[ns]', + freq='D') + + >>> pd.date_range(start='2018-01-01', end='20180103') + ... 
# doctest: +NORMALIZE_WHITESPACE + DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03'], + dtype='datetime64[ns]', freq='D') """ return DatetimeIndex(start=start, end=end, periods=periods, freq=freq, tz=tz, normalize=normalize, name=name, diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index ec2874b3bae95..560e7638b5510 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1920,11 +1920,49 @@ def _convert_key(self, key, is_setter=False): class _iAtIndexer(_ScalarAccessIndexer): - """Fast integer location scalar accessor. + """ + Access a single value for a row/column pair by integer position. - Similarly to ``iloc``, ``iat`` provides **integer** based lookups. - You can also set using these indexers. + Similar to ``iloc``, in that both provide integer-based lookups. Use + ``iat`` if you only need to get or set a single value in a DataFrame + or Series. + + See Also + -------- + DataFrame.at : Access a single value for a row/column label pair + DataFrame.loc : Access a group of rows and columns by label(s) + DataFrame.iloc : Access a group of rows and columns by integer position(s) + + Examples + -------- + >>> df = pd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], + ... 
columns=['A', 'B', 'C']) + >>> df + A B C + 0 0 2 3 + 1 0 4 1 + 2 10 20 30 + + Get value at specified row/column pair + >>> df.iat[1, 2] + 1 + + Set value at specified row/column pair + + >>> df.iat[1, 2] = 10 + >>> df.iat[1, 2] + 10 + + Get value within a series + + >>> df.loc[0].iat[1] + 2 + + Raises + ------ + IndexError + When integer position is out of bounds """ _takeable = True diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 83879cdaaa63c..6c6a54993b669 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -343,50 +343,93 @@ def _get_op_name(op, special): # ----------------------------------------------------------------------------- # Docstring Generation and Templates +_add_example_FRAME = """ +>>> a = pd.DataFrame([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd'], + columns=['one']) +>>> a + one +a 1.0 +b 1.0 +c 1.0 +d NaN +>>> b = pd.DataFrame(dict(one=[1, np.nan, 1, np.nan], + two=[np.nan, 2, np.nan, 2]), + index=['a', 'b', 'd', 'e']) +>>> b + one two +a 1.0 NaN +b NaN 2.0 +d 1.0 NaN +e NaN 2.0 +>>> a.add(b, fill_value=0) + one two +a 2.0 NaN +b 1.0 2.0 +c 1.0 NaN +d 1.0 NaN +e NaN 2.0 +""" + _op_descriptions = { + # Arithmetic Operators 'add': {'op': '+', 'desc': 'Addition', - 'reverse': 'radd'}, + 'reverse': 'radd', + 'df_examples': _add_example_FRAME}, 'sub': {'op': '-', 'desc': 'Subtraction', - 'reverse': 'rsub'}, + 'reverse': 'rsub', + 'df_examples': None}, 'mul': {'op': '*', 'desc': 'Multiplication', - 'reverse': 'rmul'}, + 'reverse': 'rmul', + 'df_examples': None}, 'mod': {'op': '%', 'desc': 'Modulo', - 'reverse': 'rmod'}, + 'reverse': 'rmod', + 'df_examples': None}, 'pow': {'op': '**', 'desc': 'Exponential power', - 'reverse': 'rpow'}, + 'reverse': 'rpow', + 'df_examples': None}, 'truediv': {'op': '/', 'desc': 'Floating division', - 'reverse': 'rtruediv'}, + 'reverse': 'rtruediv', + 'df_examples': None}, 'floordiv': {'op': '//', 'desc': 'Integer division', - 'reverse': 'rfloordiv'}, + 'reverse': 'rfloordiv', + 'df_examples': None}, 
'divmod': {'op': 'divmod', 'desc': 'Integer division and modulo', - 'reverse': None}, + 'reverse': None, + 'df_examples': None}, + # Comparison Operators 'eq': {'op': '==', - 'desc': 'Equal to', - 'reverse': None}, + 'desc': 'Equal to', + 'reverse': None, + 'df_examples': None}, 'ne': {'op': '!=', - 'desc': 'Not equal to', - 'reverse': None}, + 'desc': 'Not equal to', + 'reverse': None, + 'df_examples': None}, 'lt': {'op': '<', - 'desc': 'Less than', - 'reverse': None}, + 'desc': 'Less than', + 'reverse': None, + 'df_examples': None}, 'le': {'op': '<=', - 'desc': 'Less than or equal to', - 'reverse': None}, + 'desc': 'Less than or equal to', + 'reverse': None, + 'df_examples': None}, 'gt': {'op': '>', - 'desc': 'Greater than', - 'reverse': None}, + 'desc': 'Greater than', + 'reverse': None, + 'df_examples': None}, 'ge': {'op': '>=', - 'desc': 'Greater than or equal to', - 'reverse': None}} + 'desc': 'Greater than or equal to', + 'reverse': None, + 'df_examples': None}} _op_names = list(_op_descriptions.keys()) for key in _op_names: @@ -532,30 +575,7 @@ def _get_op_name(op, special): Examples -------- ->>> a = pd.DataFrame([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd'], - columns=['one']) ->>> a - one -a 1.0 -b 1.0 -c 1.0 -d NaN ->>> b = pd.DataFrame(dict(one=[1, np.nan, 1, np.nan], - two=[np.nan, 2, np.nan, 2]), - index=['a', 'b', 'd', 'e']) ->>> b - one two -a 1.0 NaN -b NaN 2.0 -d 1.0 NaN -e NaN 2.0 ->>> a.add(b, fill_value=0) - one two -a 2.0 NaN -b 1.0 2.0 -c 1.0 NaN -d 1.0 NaN -e NaN 2.0 +{df_examples} See also -------- @@ -622,14 +642,19 @@ def _make_flex_doc(op_name, typ): if typ == 'series': base_doc = _flex_doc_SERIES + doc = base_doc.format(desc=op_desc['desc'], op_name=op_name, + equiv=equiv, reverse=op_desc['reverse']) elif typ == 'dataframe': base_doc = _flex_doc_FRAME + doc = base_doc.format(desc=op_desc['desc'], op_name=op_name, + equiv=equiv, reverse=op_desc['reverse'], + df_examples=op_desc['df_examples']) elif typ == 'panel': base_doc = 
_flex_doc_PANEL + doc = base_doc.format(desc=op_desc['desc'], op_name=op_name, + equiv=equiv, reverse=op_desc['reverse']) else: raise AssertionError('Invalid typ argument.') - doc = base_doc.format(desc=op_desc['desc'], op_name=op_name, - equiv=equiv, reverse=op_desc['reverse']) return doc diff --git a/pandas/core/series.py b/pandas/core/series.py index 069f0372ab6e1..19a9a0cf3da0f 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -547,6 +547,71 @@ def __len__(self): return len(self._data) def view(self, dtype=None): + """ + Create a new view of the Series. + + This function will return a new Series with a view of the same + underlying values in memory, optionally reinterpreted with a new data + type. The new data type must preserve the same size in bytes as to not + cause index misalignment. + + Parameters + ---------- + dtype : data type + Data type object or one of their string representations. + + Returns + ------- + Series + A new Series object as a view of the same data in memory. + + See Also + -------- + numpy.ndarray.view : Equivalent numpy function to create a new view of + the same data in memory. + + Notes + ----- + Series are instantiated with ``dtype=float64`` by default. While + ``numpy.ndarray.view()`` will return a view with the same data type as + the original array, ``Series.view()`` (without specified dtype) + will try using ``float64`` and may fail if the original data type size + in bytes is not the same. 
+ + Examples + -------- + >>> s = pd.Series([-2, -1, 0, 1, 2], dtype='int8') + >>> s + 0 -2 + 1 -1 + 2 0 + 3 1 + 4 2 + dtype: int8 + + The 8 bit signed integer representation of `-1` is `0b11111111`, but + the same bytes represent 255 if read as an 8 bit unsigned integer: + + >>> us = s.view('uint8') + >>> us + 0 254 + 1 255 + 2 0 + 3 1 + 4 2 + dtype: uint8 + + The views share the same underlying values: + + >>> us[0] = 128 + >>> s + 0 -128 + 1 -1 + 2 0 + 3 1 + 4 2 + dtype: int8 + """ return self._constructor(self._values.view(dtype), index=self.index).__finalize__(self) @@ -1316,8 +1381,77 @@ def unique(self): return result - @Appender(base._shared_docs['drop_duplicates'] % _shared_doc_kwargs) def drop_duplicates(self, keep='first', inplace=False): + """ + Return Series with duplicate values removed. + + Parameters + ---------- + keep : {'first', 'last', ``False``}, default 'first' + - 'first' : Drop duplicates except for the first occurrence. + - 'last' : Drop duplicates except for the last occurrence. + - ``False`` : Drop all duplicates. + inplace : boolean, default ``False`` + If ``True``, performs operation inplace and returns None. + + Returns + ------- + deduplicated : Series + + See Also + -------- + Index.drop_duplicates : equivalent method on Index + DataFrame.drop_duplicates : equivalent method on DataFrame + Series.duplicated : related method on Series, indicating duplicate + Series values. + + Examples + -------- + Generate an Series with duplicated entries. + + >>> s = pd.Series(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo'], + ... name='animal') + >>> s + 0 lama + 1 cow + 2 lama + 3 beetle + 4 lama + 5 hippo + Name: animal, dtype: object + + With the 'keep' parameter, the selection behaviour of duplicated values + can be changed. The value 'first' keeps the first occurrence for each + set of duplicated entries. The default value of keep is 'first'. 
+ + >>> s.drop_duplicates() + 0 lama + 1 cow + 3 beetle + 5 hippo + Name: animal, dtype: object + + The value 'last' for parameter 'keep' keeps the last occurrence for + each set of duplicated entries. + + >>> s.drop_duplicates(keep='last') + 1 cow + 3 beetle + 4 lama + 5 hippo + Name: animal, dtype: object + + The value ``False`` for parameter 'keep' discards all sets of + duplicated entries. Setting the value of 'inplace' to ``True`` performs + the operation inplace and returns ``None``. + + >>> s.drop_duplicates(keep=False, inplace=True) + >>> s + 1 cow + 3 beetle + 5 hippo + Name: animal, dtype: object + """ return super(Series, self).drop_duplicates(keep=keep, inplace=inplace) @Appender(base._shared_docs['duplicated'] % _shared_doc_kwargs) @@ -1538,16 +1672,63 @@ def cov(self, other, min_periods=None): def diff(self, periods=1): """ - 1st discrete difference of object + First discrete difference of element. + + Calculates the difference of a Series element compared with another + element in the Series (default is element in previous row). Parameters ---------- periods : int, default 1 - Periods to shift for forming difference + Periods to shift for calculating difference, accepts negative + values. Returns ------- diffed : Series + + See Also + -------- + Series.pct_change: Percent change over given number of periods. + Series.shift: Shift index by desired number of periods with an + optional time freq. 
+ DataFrame.diff: First discrete difference of object + + Examples + -------- + Difference with previous row + + >>> s = pd.Series([1, 1, 2, 3, 5, 8]) + >>> s.diff() + 0 NaN + 1 0.0 + 2 1.0 + 3 1.0 + 4 2.0 + 5 3.0 + dtype: float64 + + Difference with 3rd previous row + + >>> s.diff(periods=3) + 0 NaN + 1 NaN + 2 NaN + 3 2.0 + 4 4.0 + 5 6.0 + dtype: float64 + + Difference with following row + + >>> s.diff(periods=-1) + 0 0.0 + 1 -1.0 + 2 -1.0 + 3 -2.0 + 4 -3.0 + 5 NaN + dtype: float64 """ result = algorithms.diff(com._values_from_object(self), periods) return self._constructor(result, index=self.index).__finalize__(self) @@ -2696,28 +2877,54 @@ def reindex_axis(self, labels, axis=0, **kwargs): return self.reindex(index=labels, **kwargs) def memory_usage(self, index=True, deep=False): - """Memory usage of the Series + """ + Return the memory usage of the Series. + + The memory usage can optionally include the contribution of + the index and of elements of `object` dtype. Parameters ---------- - index : bool - Specifies whether to include memory usage of Series index - deep : bool - Introspect the data deeply, interrogate - `object` dtypes for system-level memory consumption + index : bool, default True + Specifies whether to include the memory usage of the Series index. + deep : bool, default False + If True, introspect the data deeply by interrogating + `object` dtypes for system-level memory consumption, and include + it in the returned value. Returns ------- - scalar bytes of memory consumed - - Notes - ----- - Memory usage does not include memory consumed by elements that - are not components of the array if deep=False + int + Bytes of memory consumed. See Also -------- - numpy.ndarray.nbytes + numpy.ndarray.nbytes : Total bytes consumed by the elements of the + array. + DataFrame.memory_usage : Bytes consumed by a DataFrame. 
+ + Examples + -------- + + >>> s = pd.Series(range(3)) + >>> s.memory_usage() + 104 + + Not including the index gives the size of the rest of the data, which + is necessarily smaller: + + >>> s.memory_usage(index=False) + 24 + + The memory footprint of `object` values is ignored by default: + + >>> s = pd.Series(["a", "b"]) + >>> s.values + array(['a', 'b'], dtype=object) + >>> s.memory_usage() + 96 + >>> s.memory_usage(deep=True) + 212 """ v = super(Series, self).memory_usage(deep=deep) if index: @@ -2745,20 +2952,21 @@ def _take(self, indices, axis=0, convert=True, is_copy=False): def isin(self, values): """ - Return a boolean :class:`~pandas.Series` showing whether each element - in the :class:`~pandas.Series` is exactly contained in the passed - sequence of ``values``. + Check whether `values` are contained in Series. + + Return a boolean Series showing whether each element in the Series + matches an element in the passed sequence of `values` exactly. Parameters ---------- values : set or list-like The sequence of values to test. Passing in a single string will raise a ``TypeError``. Instead, turn a single string into a - ``list`` of one element. + list of one element. .. versionadded:: 0.18.1 - Support for values as a set + Support for values as a set. Returns ------- @@ -2767,31 +2975,37 @@ def isin(self, values): Raises ------ TypeError - * If ``values`` is a string + * If `values` is a string See Also -------- - pandas.DataFrame.isin + pandas.DataFrame.isin : equivalent method on DataFrame Examples -------- - >>> s = pd.Series(list('abc')) - >>> s.isin(['a', 'c', 'e']) + >>> s = pd.Series(['lama', 'cow', 'lama', 'beetle', 'lama', + ... 'hippo'], name='animal') + >>> s.isin(['cow', 'lama']) 0 True - 1 False + 1 True 2 True - dtype: bool + 3 False + 4 True + 5 False + Name: animal, dtype: bool - Passing a single string as ``s.isin('a')`` will raise an error. Use + Passing a single string as ``s.isin('lama')`` will raise an error. 
Use a list of one element instead: - >>> s.isin(['a']) + >>> s.isin(['lama']) 0 True 1 False - 2 False - dtype: bool - + 2 True + 3 False + 4 True + 5 False + Name: animal, dtype: bool """ result = algorithms.isin(com._values_from_object(self), values) return self._constructor(result, index=self.index).__finalize__(self) diff --git a/pandas/core/window.py b/pandas/core/window.py index c41b07759d555..5294cdfd5662d 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -926,28 +926,7 @@ def skew(self, **kwargs): Notes ----- - A minimum of 4 periods is required for the rolling calculation. - - Examples - -------- - The below example will show a rolling calculation with a window size of - four matching the equivalent function call using `scipy.stats`. - - >>> arr = [1, 2, 3, 4, 999] - >>> fmt = "{0:.6f}" # limit the printed precision to 6 digits - >>> import scipy.stats - >>> print(fmt.format(scipy.stats.kurtosis(arr[:-1], bias=False))) - -1.200000 - >>> print(fmt.format(scipy.stats.kurtosis(arr[1:], bias=False))) - 3.999946 - >>> s = pd.Series(arr) - >>> s.rolling(4).kurt() - 0 NaN - 1 NaN - 2 NaN - 3 -1.200000 - 4 3.999946 - dtype: float64 + A minimum of 4 periods is required for the %(name)s calculation. """) def kurt(self, **kwargs): @@ -1269,6 +1248,31 @@ def var(self, ddof=1, *args, **kwargs): def skew(self, **kwargs): return super(Rolling, self).skew(**kwargs) + _agg_doc = dedent(""" + Examples + -------- + + The example below will show a rolling calculation with a window size of + four matching the equivalent function call using `scipy.stats`. 
+ + >>> arr = [1, 2, 3, 4, 999] + >>> fmt = "{0:.6f}" # limit the printed precision to 6 digits + >>> import scipy.stats + >>> print(fmt.format(scipy.stats.kurtosis(arr[:-1], bias=False))) + -1.200000 + >>> print(fmt.format(scipy.stats.kurtosis(arr[1:], bias=False))) + 3.999946 + >>> s = pd.Series(arr) + >>> s.rolling(4).kurt() + 0 NaN + 1 NaN + 2 NaN + 3 -1.200000 + 4 3.999946 + dtype: float64 + """) + + @Appender(_agg_doc) @Substitution(name='rolling') @Appender(_shared_docs['kurt']) def kurt(self, **kwargs): @@ -1508,6 +1512,31 @@ def var(self, ddof=1, *args, **kwargs): def skew(self, **kwargs): return super(Expanding, self).skew(**kwargs) + _agg_doc = dedent(""" + Examples + -------- + + The example below will show an expanding calculation with a window size of + four matching the equivalent function call using `scipy.stats`. + + >>> arr = [1, 2, 3, 4, 999] + >>> import scipy.stats + >>> fmt = "{0:.6f}" # limit the printed precision to 6 digits + >>> print(fmt.format(scipy.stats.kurtosis(arr[:-1], bias=False))) + -1.200000 + >>> print(fmt.format(scipy.stats.kurtosis(arr, bias=False))) + 4.999874 + >>> s = pd.Series(arr) + >>> s.expanding(4).kurt() + 0 NaN + 1 NaN + 2 NaN + 3 -1.200000 + 4 4.999874 + dtype: float64 + """) + + @Appender(_agg_doc) @Substitution(name='expanding') @Appender(_shared_docs['kurt']) def kurt(self, **kwargs): diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 98fdcf8f94ae0..520c6cecce6d7 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -2532,7 +2532,8 @@ def line(self, **kwds): Parameters ---------- `**kwds` : optional - Keyword arguments to pass on to :py:meth:`pandas.Series.plot`. + Additional keyword arguments are documented in + :meth:`pandas.Series.plot`. Returns ------- @@ -2556,7 +2557,8 @@ def bar(self, **kwds): Parameters ---------- `**kwds` : optional - Keyword arguments to pass on to :py:meth:`pandas.Series.plot`. 
+ Additional keyword arguments are documented in + :meth:`pandas.Series.plot`. Returns ------- @@ -2571,7 +2573,8 @@ def barh(self, **kwds): Parameters ---------- `**kwds` : optional - Keyword arguments to pass on to :py:meth:`pandas.Series.plot`. + Additional keyword arguments are documented in + :meth:`pandas.Series.plot`. Returns ------- @@ -2586,7 +2589,8 @@ def box(self, **kwds): Parameters ---------- `**kwds` : optional - Keyword arguments to pass on to :py:meth:`pandas.Series.plot`. + Additional keyword arguments are documented in + :meth:`pandas.Series.plot`. Returns ------- @@ -2603,7 +2607,8 @@ def hist(self, bins=10, **kwds): bins: integer, default 10 Number of histogram bins to be used `**kwds` : optional - Keyword arguments to pass on to :py:meth:`pandas.Series.plot`. + Additional keyword arguments are documented in + :meth:`pandas.Series.plot`. Returns ------- @@ -2613,26 +2618,74 @@ def hist(self, bins=10, **kwds): def kde(self, bw_method=None, ind=None, **kwds): """ - Kernel Density Estimate plot + Kernel Density Estimate plot using Gaussian kernels. + + In statistics, kernel density estimation (KDE) is a non-parametric way + to estimate the probability density function (PDF) of a random + variable. This function uses Gaussian kernels and includes automatic + bandwith determination. Parameters ---------- - bw_method: str, scalar or callable, optional - The method used to calculate the estimator bandwidth. This can be + bw_method : str, scalar or callable, optional + The method used to calculate the estimator bandwidth. This can be 'scott', 'silverman', a scalar constant or a callable. If None (default), 'scott' is used. See :class:`scipy.stats.gaussian_kde` for more information. ind : NumPy array or integer, optional - Evaluation points. If None (default), 1000 equally spaced points - are used. If `ind` is a NumPy array, the kde is evaluated at the - points passed. If `ind` is an integer, `ind` number of equally - spaced points are used. 
- `**kwds` : optional - Keyword arguments to pass on to :py:meth:`pandas.Series.plot`. + Evaluation points for the estimated PDF. If None (default), + 1000 equally spaced points are used. If `ind` is a NumPy array, the + kde is evaluated at the points passed. If `ind` is an integer, + `ind` number of equally spaced points are used. + kwds : optional + Additional keyword arguments are documented in + :meth:`pandas.Series.plot`. Returns ------- axes : matplotlib.AxesSubplot or np.array of them + + See also + -------- + scipy.stats.gaussian_kde : Representation of a kernel-density + estimate using Gaussian kernels. This is the function used + internally to estimate the PDF. + + Examples + -------- + Given a Series of points randomly sampled from an unknown + distribution, estimate this distribution using KDE with automatic + bandwidth determination and plot the results, evaluating them at + 1000 equally spaced points (default): + + .. plot:: + :context: close-figs + + >>> s = pd.Series([1, 2, 2.5, 3, 3.5, 4, 5]) + >>> ax = s.plot.kde() + + + An scalar fixed bandwidth can be specified. Using a too small bandwidth + can lead to overfitting, while a too large bandwidth can result in + underfitting: + + .. plot:: + :context: close-figs + + >>> ax = s.plot.kde(bw_method=0.3) + + .. plot:: + :context: close-figs + + >>> ax = s.plot.kde(bw_method=3) + + Finally, the `ind` parameter determines the evaluation points for the + plot of the estimated PDF: + + .. plot:: + :context: close-figs + + >>> ax = s.plot.kde(ind=[1, 2, 3, 4, 5]) """ return self(kind='kde', bw_method=bw_method, ind=ind, **kwds) @@ -2645,7 +2698,8 @@ def area(self, **kwds): Parameters ---------- `**kwds` : optional - Keyword arguments to pass on to :py:meth:`pandas.Series.plot`. + Additional keyword arguments are documented in + :meth:`pandas.Series.plot`. 
Returns ------- @@ -2660,7 +2714,8 @@ def pie(self, **kwds): Parameters ---------- `**kwds` : optional - Keyword arguments to pass on to :py:meth:`pandas.Series.plot`. + Additional keyword arguments are documented in + :meth:`pandas.Series.plot`. Returns ------- @@ -2711,7 +2766,8 @@ def line(self, x=None, y=None, **kwds): x, y : label or position, optional Coordinates for each point. `**kwds` : optional - Keyword arguments to pass on to :py:meth:`pandas.DataFrame.plot`. + Additional keyword arguments are documented in + :meth:`pandas.DataFrame.plot`. Returns ------- @@ -2728,7 +2784,8 @@ def bar(self, x=None, y=None, **kwds): x, y : label or position, optional Coordinates for each point. `**kwds` : optional - Keyword arguments to pass on to :py:meth:`pandas.DataFrame.plot`. + Additional keyword arguments are documented in + :meth:`pandas.DataFrame.plot`. Returns ------- @@ -2745,7 +2802,8 @@ def barh(self, x=None, y=None, **kwds): x, y : label or position, optional Coordinates for each point. `**kwds` : optional - Keyword arguments to pass on to :py:meth:`pandas.DataFrame.plot`. + Additional keyword arguments are documented in + :meth:`pandas.DataFrame.plot`. Returns ------- @@ -2762,7 +2820,8 @@ def box(self, by=None, **kwds): by : string or sequence Column in the DataFrame to group by. `**kwds` : optional - Keyword arguments to pass on to :py:meth:`pandas.DataFrame.plot`. + Additional keyword arguments are documented in + :meth:`pandas.DataFrame.plot`. Returns ------- @@ -2781,7 +2840,8 @@ def hist(self, by=None, bins=10, **kwds): bins: integer, default 10 Number of histogram bins to be used `**kwds` : optional - Keyword arguments to pass on to :py:meth:`pandas.DataFrame.plot`. + Additional keyword arguments are documented in + :meth:`pandas.DataFrame.plot`. Returns ------- @@ -2806,7 +2866,8 @@ def kde(self, bw_method=None, ind=None, **kwds): points passed. If `ind` is an integer, `ind` number of equally spaced points are used. 
`**kwds` : optional - Keyword arguments to pass on to :py:meth:`pandas.DataFrame.plot`. + Additional keyword arguments are documented in + :meth:`pandas.DataFrame.plot`. Returns ------- @@ -2825,7 +2886,8 @@ def area(self, x=None, y=None, **kwds): x, y : label or position, optional Coordinates for each point. `**kwds` : optional - Keyword arguments to pass on to :py:meth:`pandas.DataFrame.plot`. + Additional keyword arguments are documented in + :meth:`pandas.DataFrame.plot`. Returns ------- @@ -2842,7 +2904,8 @@ def pie(self, y=None, **kwds): y : label or position, optional Column to plot. `**kwds` : optional - Keyword arguments to pass on to :py:meth:`pandas.DataFrame.plot`. + Additional keyword arguments are documented in + :meth:`pandas.DataFrame.plot`. Returns ------- @@ -2863,7 +2926,8 @@ def scatter(self, x, y, s=None, c=None, **kwds): c : label or position, optional Color of each point. `**kwds` : optional - Keyword arguments to pass on to :py:meth:`pandas.DataFrame.plot`. + Additional keyword arguments are documented in + :meth:`pandas.DataFrame.plot`. Returns ------- @@ -2888,7 +2952,8 @@ def hexbin(self, x, y, C=None, reduce_C_function=None, gridsize=None, gridsize : int, optional Number of bins. `**kwds` : optional - Keyword arguments to pass on to :py:meth:`pandas.DataFrame.plot`. + Additional keyword arguments are documented in + :meth:`pandas.DataFrame.plot`. Returns ------- diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index 45594e9c6ea95..150c9274d4e5c 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -147,25 +147,66 @@ def _get_marker_compat(marker): def radviz(frame, class_column, ax=None, color=None, colormap=None, **kwds): - """RadViz - a multivariate data visualization algorithm + """ + Plot a multidimensional dataset in 2D. + + Each Series in the DataFrame is represented as a evenly distributed + slice on a circle. Each data point is rendered in the circle according to + the value on each Series. 
Highly correlated `Series` in the `DataFrame` + are placed closer on the unit circle. + + RadViz allow to project a N-dimensional data set into a 2D space where the + influence of each dimension can be interpreted as a balance between the + influence of all dimensions. + + More info available at the `original article + `_ + describing RadViz. Parameters ---------- - frame: DataFrame - class_column: str - Column name containing class names - ax: Matplotlib axis object, optional - color: list or tuple, optional - Colors to use for the different classes - colormap : str or matplotlib colormap object, default None - Colormap to select colors from. If string, load colormap with that name - from matplotlib. - kwds: keywords - Options to pass to matplotlib scatter plotting method + frame : `DataFrame` + Pandas object holding the data. + class_column : str + Column name containing the name of the data point category. + ax : :class:`matplotlib.axes.Axes`, optional + A plot instance to which to add the information. + color : list[str] or tuple[str], optional + Assign a color to each category. Example: ['blue', 'green']. + colormap : str or :class:`matplotlib.colors.Colormap`, default None + Colormap to select colors from. If string, load colormap with that + name from matplotlib. + kwds : optional + Options to pass to matplotlib scatter plotting method. Returns ------- - ax: Matplotlib axis object + axes : :class:`matplotlib.axes.Axes` + + See Also + -------- + pandas.plotting.andrews_curves : Plot clustering visualization + + Examples + -------- + .. plot:: + :context: close-figs + + >>> df = pd.DataFrame({ + ... 'SepalLength': [6.5, 7.7, 5.1, 5.8, 7.6, 5.0, 5.4, 4.6, + ... 6.7, 4.6], + ... 'SepalWidth': [3.0, 3.8, 3.8, 2.7, 3.0, 2.3, 3.0, 3.2, + ... 3.3, 3.6], + ... 'PetalLength': [5.5, 6.7, 1.9, 5.1, 6.6, 3.3, 4.5, 1.4, + ... 5.7, 1.0], + ... 'PetalWidth': [1.8, 2.2, 0.4, 1.9, 2.1, 1.0, 1.5, 0.2, + ... 2.1, 0.2], + ... 
'Category': ['virginica', 'virginica', 'setosa', + ... 'virginica', 'virginica', 'versicolor', + ... 'versicolor', 'setosa', 'virginica', + ... 'setosa'] + ... }) + >>> rad_viz = pd.plotting.radviz(df, 'Category') """ import matplotlib.pyplot as plt import matplotlib.patches as patches @@ -323,20 +364,51 @@ def f(t): def bootstrap_plot(series, fig=None, size=50, samples=500, **kwds): - """Bootstrap plot. + """ + Bootstrap plot on mean, median and mid-range statistics. + + The bootstrap plot is used to estimate the uncertainty of a statistic + by relaying on random sampling with replacement [1]_. This function will + generate bootstrapping plots for mean, median and mid-range statistics + for the given number of samples of the given size. + + .. [1] "Bootstrapping (statistics)" in \ + https://en.wikipedia.org/wiki/Bootstrapping_%28statistics%29 Parameters ---------- - series: Time series - fig: matplotlib figure object, optional - size: number of data points to consider during each sampling - samples: number of times the bootstrap procedure is performed - kwds: optional keyword arguments for plotting commands, must be accepted - by both hist and plot + series : pandas.Series + Pandas Series from where to get the samplings for the bootstrapping. + fig : matplotlib.figure.Figure, default None + If given, it will use the `fig` reference for plotting instead of + creating a new one with default parameters. + size : int, default 50 + Number of data points to consider during each sampling. It must be + greater or equal than the length of the `series`. + samples : int, default 500 + Number of times the bootstrap procedure is performed. + **kwds : + Options to pass to matplotlib plotting method. Returns ------- - fig: matplotlib figure + fig : matplotlib.figure.Figure + Matplotlib figure + + See Also + -------- + pandas.DataFrame.plot : Basic plotting for DataFrame objects. + pandas.Series.plot : Basic plotting for Series objects. + + Examples + -------- + + .. 
plot:: + :context: close-figs + + >>> import numpy as np + >>> s = pd.Series(np.random.uniform(size=100)) + >>> fig = pd.plotting.bootstrap_plot(s) """ import random import matplotlib.pyplot as plt