diff --git a/pandas/core/resample.py b/pandas/core/resample.py index e8803b6f30fce..44304bcc7f388 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1,7 +1,6 @@ from __future__ import annotations import copy -from textwrap import dedent from typing import ( TYPE_CHECKING, Concatenate, @@ -28,11 +27,6 @@ ) from pandas._typing import NDFrameT from pandas.errors import AbstractMethodError -from pandas.util._decorators import ( - Appender, - Substitution, - doc, -) from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.dtypes import ( @@ -53,12 +47,10 @@ ) from pandas.core.generic import ( NDFrame, - _shared_docs, ) from pandas.core.groupby.groupby import ( BaseGroupBy, GroupBy, - _pipe_template, get_groupby, ) from pandas.core.groupby.grouper import Grouper @@ -114,6 +106,7 @@ DataFrame, Series, ) + from pandas.core.generic import NDFrame _shared_docs_kwargs: dict[str, str] = {} @@ -263,100 +256,183 @@ def pipe( ) -> T: ... @final - @Substitution( - klass="Resampler", - examples=""" - >>> df = pd.DataFrame({'A': [1, 2, 3, 4]}, - ... index=pd.date_range('2012-08-02', periods=4)) - >>> df - A - 2012-08-02 1 - 2012-08-03 2 - 2012-08-04 3 - 2012-08-05 4 - - To get the difference between each 2-day period's maximum and minimum - value in one pass, you can do - - >>> df.resample('2D').pipe(lambda x: x.max() - x.min()) - A - 2012-08-02 1 - 2012-08-04 1""", - ) - @Appender(_pipe_template) def pipe( self, func: Callable[Concatenate[Self, P], T] | tuple[Callable[..., T], str], *args: Any, **kwargs: Any, ) -> T: - return super().pipe(func, *args, **kwargs) - - _agg_see_also_doc = dedent( """ - See Also - -------- - DataFrame.groupby.aggregate : Aggregate using callable, string, dict, - or list of string/callables. - DataFrame.resample.transform : Transforms the Series on each group - based on the given function. - DataFrame.aggregate: Aggregate using one or more - operations over the specified axis. 
- """ - ) + Apply a ``func`` with arguments to this Resampler object and return its result. - _agg_examples_doc = dedent( - """ - Examples - -------- - >>> s = pd.Series([1, 2, 3, 4, 5], - ... index=pd.date_range('20130101', periods=5, freq='s')) - >>> s - 2013-01-01 00:00:00 1 - 2013-01-01 00:00:01 2 - 2013-01-01 00:00:02 3 - 2013-01-01 00:00:03 4 - 2013-01-01 00:00:04 5 - Freq: s, dtype: int64 - - >>> r = s.resample('2s') - - >>> r.agg("sum") - 2013-01-01 00:00:00 3 - 2013-01-01 00:00:02 7 - 2013-01-01 00:00:04 5 - Freq: 2s, dtype: int64 - - >>> r.agg(['sum', 'mean', 'max']) - sum mean max - 2013-01-01 00:00:00 3 1.5 2 - 2013-01-01 00:00:02 7 3.5 4 - 2013-01-01 00:00:04 5 5.0 5 - - >>> r.agg({'result': lambda x: x.mean() / x.std(), - ... 'total': "sum"}) - result total - 2013-01-01 00:00:00 2.121320 3 - 2013-01-01 00:00:02 4.949747 7 - 2013-01-01 00:00:04 NaN 5 - - >>> r.agg(average="mean", total="sum") - average total - 2013-01-01 00:00:00 1.5 3 - 2013-01-01 00:00:02 3.5 7 - 2013-01-01 00:00:04 5.0 5 - """ - ) + Use `.pipe` when you want to improve readability by chaining together + functions that expect Series, DataFrames, GroupBy or Resampler objects. + Instead of writing + + >>> h = lambda x, arg2, arg3: x + 1 - arg2 * arg3 + >>> g = lambda x, arg1: x * 5 / arg1 + >>> f = lambda x: x**4 + >>> df = pd.DataFrame([["a", 4], ["b", 5]], columns=["group", "value"]) + >>> h(g(f(df.groupby("group")), arg1=1), arg2=2, arg3=3) # doctest: +SKIP + + You can write + + >>> ( + ... df.groupby("group").pipe(f).pipe(g, arg1=1).pipe(h, arg2=2, arg3=3) + ... ) # doctest: +SKIP + + which is much more readable. + + Parameters + ---------- + func : callable or tuple of (callable, str) + Function to apply to this Resampler object or, alternatively, + a `(callable, data_keyword)` tuple where `data_keyword` is a + string indicating the keyword of `callable` that expects the + Resampler object. + *args : iterable, optional + Positional arguments passed into `func`. 
+ **kwargs : dict, optional + A dictionary of keyword arguments passed into `func`. + + Returns + ------- + any + The result of applying ``func`` to the Resampler object. + + See Also + -------- + Series.pipe : Apply a function with arguments to a series. + DataFrame.pipe: Apply a function with arguments to a dataframe. + apply : Apply function to each group instead of to the + full Resampler object. + + Notes + ----- + See more `here + <https://pandas.pydata.org/docs/user_guide/groupby.html#piping-function-calls>`_ + + Examples + -------- + >>> df = pd.DataFrame( + ... {"A": [1, 2, 3, 4]}, index=pd.date_range("2012-08-02", periods=4) + ... ) + >>> df + A + 2012-08-02 1 + 2012-08-03 2 + 2012-08-04 3 + 2012-08-05 4 + + To get the difference between each 2-day period's maximum and minimum + value in one pass, you can do + + >>> df.resample("2D").pipe(lambda x: x.max() - x.min()) + A + 2012-08-02 1 + 2012-08-04 1 + """ + return super().pipe(func, *args, **kwargs) @final - @doc( - _shared_docs["aggregate"], - see_also=_agg_see_also_doc, - examples=_agg_examples_doc, - klass="DataFrame", - axis="", - ) def aggregate(self, func=None, *args, **kwargs): + """ + Aggregate using one or more operations over the specified axis. + + Parameters + ---------- + func : function, str, list or dict + Function to use for aggregating the data. If a function, must either + work when passed a DataFrame or when passed to DataFrame.apply. + + Accepted combinations are: + + - function + - string function name + - list of functions and/or function names, e.g. ``[np.sum, 'mean']`` + - dict of axis labels -> functions, function names or list of such. + *args + Positional arguments to pass to `func`. + **kwargs + Keyword arguments to pass to `func`. 
+ + Returns + ------- + scalar, Series or DataFrame + + The return can be: + + * scalar : when Series.agg is called with single function + * Series : when DataFrame.agg is called with a single function + * DataFrame : when DataFrame.agg is called with several functions + + See Also + -------- + DataFrame.groupby.aggregate : Aggregate using callable, string, dict, + or list of string/callables. + DataFrame.resample.transform : Transforms the Series on each group + based on the given function. + DataFrame.aggregate: Aggregate using one or more + operations over the specified axis. + + Notes + ----- + The aggregation operations are always performed over an axis, either the + index (default) or the column axis. This behavior is different from + `numpy` aggregation functions (`mean`, `median`, `prod`, `sum`, `std`, + `var`), where the default is to compute the aggregation of the flattened + array, e.g., ``numpy.mean(arr_2d)`` as opposed to + ``numpy.mean(arr_2d, axis=0)``. + + `agg` is an alias for `aggregate`. Use the alias. + + Functions that mutate the passed object can produce unexpected + behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` + for more details. + + A passed user-defined-function will be passed a Series for evaluation. + + If ``func`` defines an index relabeling, ``axis`` must be ``0`` or ``index``. + + Examples + -------- + >>> s = pd.Series( + ... [1, 2, 3, 4, 5], index=pd.date_range("20130101", periods=5, freq="s") + ... 
) + >>> s + 2013-01-01 00:00:00 1 + 2013-01-01 00:00:01 2 + 2013-01-01 00:00:02 3 + 2013-01-01 00:00:03 4 + 2013-01-01 00:00:04 5 + Freq: s, dtype: int64 + + >>> r = s.resample("2s") + + >>> r.agg("sum") + 2013-01-01 00:00:00 3 + 2013-01-01 00:00:02 7 + 2013-01-01 00:00:04 5 + Freq: 2s, dtype: int64 + + >>> r.agg(["sum", "mean", "max"]) + sum mean max + 2013-01-01 00:00:00 3 1.5 2 + 2013-01-01 00:00:02 7 3.5 4 + 2013-01-01 00:00:04 5 5.0 5 + + >>> r.agg({"result": lambda x: x.mean() / x.std(), "total": "sum"}) + result total + 2013-01-01 00:00:00 2.121320 3 + 2013-01-01 00:00:02 4.949747 7 + 2013-01-01 00:00:04 NaN 5 + + >>> r.agg(average="mean", total="sum") + average total + 2013-01-01 00:00:00 1.5 3 + 2013-01-01 00:00:02 3.5 7 + 2013-01-01 00:00:04 5.0 5 + """ result = ResamplerWindowApply(self, func, args=args, kwargs=kwargs).agg() if result is None: how = func @@ -1236,25 +1312,107 @@ def max( return self._downsample("max", numeric_only=numeric_only, min_count=min_count) @final - @doc(GroupBy.first) def first( self, numeric_only: bool = False, min_count: int = 0, skipna: bool = True, ): + """ + Compute the first non-null entry of each column. + + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. + min_count : int, default 0 + The required number of valid values to perform the operation. If fewer + than ``min_count`` non-NA values are present the result will be NA. + skipna : bool, default True + Exclude NA/null values. If an entire group is NA, the result will be NA. + + Returns + ------- + Series or DataFrame + First values within each group. + + See Also + -------- + core.resample.Resampler.last : Compute the last non-null value in each group. + core.resample.Resampler.mean : Compute mean of groups, excluding missing values. + + Examples + -------- + >>> s = pd.Series( + ... [1, 2, 3, 4], + ... index=pd.DatetimeIndex( + ... ["2023-01-01", "2023-01-15", "2023-02-01", "2023-02-15"] + ... ), + ... 
) + >>> s + 2023-01-01 1 + 2023-01-15 2 + 2023-02-01 3 + 2023-02-15 4 + dtype: int64 + >>> s.resample("MS").first() + 2023-01-01 1 + 2023-02-01 3 + Freq: MS, dtype: int64 + """ return self._downsample( "first", numeric_only=numeric_only, min_count=min_count, skipna=skipna ) @final - @doc(GroupBy.last) def last( self, numeric_only: bool = False, min_count: int = 0, skipna: bool = True, ): + """ + Compute the last non-null entry of each column. + + Parameters + ---------- + numeric_only : bool, default False + Include only float, int, boolean columns. + min_count : int, default 0 + The required number of valid values to perform the operation. If fewer + than ``min_count`` non-NA values are present the result will be NA. + skipna : bool, default True + Exclude NA/null values. If an entire group is NA, the result will be NA. + + Returns + ------- + Series or DataFrame + Last of values within each group. + + See Also + -------- + core.resample.Resampler.first : Compute the first non-null value in each group. + core.resample.Resampler.mean : Compute mean of groups, excluding missing values. + + Examples + -------- + >>> s = pd.Series( + ... [1, 2, 3, 4], + ... index=pd.DatetimeIndex( + ... ["2023-01-01", "2023-01-15", "2023-02-01", "2023-02-15"] + ... ), + ... ) + >>> s + 2023-01-01 1 + 2023-01-15 2 + 2023-02-01 3 + 2023-02-15 4 + dtype: int64 + >>> s.resample("MS").last() + 2023-01-01 2 + 2023-02-01 4 + Freq: MS, dtype: int64 + """ return self._downsample( "last", numeric_only=numeric_only, min_count=min_count, skipna=skipna ) @@ -1544,8 +1702,41 @@ def sem( return self._downsample("sem", ddof=ddof, numeric_only=numeric_only) @final - @doc(GroupBy.ohlc) def ohlc(self): + """ + Compute open, high, low and close values of a group, excluding missing values. + + Returns + ------- + DataFrame + Open, high, low and close values within each group. + + See Also + -------- + DataFrame.agg : Aggregate using one or more operations over the specified axis. 
+ DataFrame.resample : Resample time-series data. + DataFrame.groupby : Group DataFrame using a mapper or by a Series of columns. + + Examples + -------- + >>> ser = pd.Series( + ... [1, 3, 2, 4, 3, 5], + ... index=pd.DatetimeIndex( + ... [ + ... "2023-01-01", + ... "2023-01-10", + ... "2023-01-15", + ... "2023-02-01", + ... "2023-02-10", + ... "2023-02-15", + ... ] + ... ), + ... ) + >>> ser.resample("MS").ohlc() + open high low close + 2023-01-01 1 3 1 2 + 2023-02-01 4 5 3 5 + """ ax = self.ax obj = self._obj_with_exclusions if len(ax) == 0: @@ -1600,8 +1791,37 @@ def nunique(self): return self._downsample("nunique") @final - @doc(GroupBy.size) def size(self): + """ + Compute group sizes. + + Returns + ------- + Series + Number of rows in each group. + + See Also + -------- + Series.groupby : Apply a function groupby to a Series. + DataFrame.groupby : Apply a function groupby to each row + or column of a DataFrame. + + Examples + -------- + >>> ser = pd.Series( + ... [1, 2, 3], + ... index=pd.DatetimeIndex(["2023-01-01", "2023-01-15", "2023-02-01"]), + ... ) + >>> ser + 2023-01-01 1 + 2023-01-15 2 + 2023-02-01 3 + dtype: int64 + >>> ser.resample("MS").size() + 2023-01-01 2 + 2023-02-01 1 + Freq: MS, dtype: int64 + """ result = self._downsample("size") # If the result is a non-empty DataFrame we stack to get a Series @@ -1620,8 +1840,40 @@ def size(self): return result @final - @doc(GroupBy.count) def count(self): + """ + Compute count of group, excluding missing values. + + Returns + ------- + Series or DataFrame + Count of values within each group. + + See Also + -------- + Series.groupby : Apply a function groupby to a Series. + DataFrame.groupby : Apply a function groupby to each row + or column of a DataFrame. + + Examples + -------- + >>> ser = pd.Series( + ... [1, 2, 3, 4], + ... index=pd.DatetimeIndex( + ... ["2023-01-01", "2023-01-15", "2023-02-01", "2023-02-15"] + ... ), + ... 
) + >>> ser + 2023-01-01 1 + 2023-01-15 2 + 2023-02-01 3 + 2023-02-15 4 + dtype: int64 + >>> ser.resample("MS").count() + 2023-01-01 2 + 2023-02-01 2 + Freq: MS, dtype: int64 + """ result = self._downsample("count") if not len(self.ax): if self._selected_obj.ndim == 1: