Merge branch 'main' into pandas.NamedAgg

pandas-dev · Jun 17, 2024 · 3fa6026 · 3fa6026
2 parents 04d8311 + dd87dd3
commit 3fa6026
Show file tree

Hide file tree

Showing 34 changed files with 680 additions and 327 deletions.
diff --git a/ci/code_checks.sh b/ci/code_checks.sh
@@ -84,7 +84,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.MultiIndex.get_loc_level PR07" \
         -i "pandas.MultiIndex.levshape SA01" \
         -i "pandas.MultiIndex.names SA01" \
-        -i "pandas.MultiIndex.nlevels SA01" \
         -i "pandas.MultiIndex.remove_unused_levels RT03,SA01" \
         -i "pandas.MultiIndex.reorder_levels RT03,SA01" \
         -i "pandas.MultiIndex.set_levels RT03,SA01" \
@@ -464,7 +463,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.io.stata.StataReader.variable_labels RT03,SA01" \
         -i "pandas.io.stata.StataWriter.write_file SA01" \
         -i "pandas.json_normalize RT03,SA01" \
-        -i "pandas.merge PR07" \
         -i "pandas.merge_asof PR07,RT03" \
         -i "pandas.period_range RT03,SA01" \
         -i "pandas.plotting.andrews_curves RT03,SA01" \

diff --git a/doc/data/titanic.csv b/doc/data/titanic.csv
diff --git a/doc/source/_static/schemas/01_table_spreadsheet.png b/doc/source/_static/schemas/01_table_spreadsheet.png
diff --git a/doc/source/development/policies.rst b/doc/source/development/policies.rst
@@ -51,7 +51,7 @@ pandas may change the behavior of experimental features at any time.
 Python support
 ~~~~~~~~~~~~~~
 
-pandas mirrors the `NumPy guidelines for Python support <https://numpy.org/neps/nep-0029-deprecation_policy.html#implementation>`__.
+pandas mirrors the `SPEC 0 guideline for Python support <https://scientific-python.org/specs/spec-0000>`__.
 
 Security policy
 ~~~~~~~~~~~~~~~

diff --git a/doc/source/getting_started/intro_tutorials/01_table_oriented.rst b/doc/source/getting_started/intro_tutorials/01_table_oriented.rst
@@ -46,7 +46,7 @@ I want to store passenger data of the Titanic. For a number of passengers, I kno
             "Name": [
                 "Braund, Mr. Owen Harris",
                 "Allen, Mr. William Henry",
-                "Bonnell, Miss. Elizabeth",
+                "Bonnell, Miss Elizabeth",
             ],
             "Age": [22, 35, 58],
             "Sex": ["male", "male", "female"],

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -571,6 +571,7 @@ Groupby/resample/rolling
 - Bug in :meth:`DataFrameGroupBy.agg` that raises ``AttributeError`` when there is dictionary input and duplicated columns, instead of returning a DataFrame with the aggregation of all duplicate columns. (:issue:`55041`)
 - Bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. (:issue:`57775`)
 - Bug in :meth:`DataFrameGroupBy.apply` with ``as_index=False`` that was returning :class:`MultiIndex` instead of returning :class:`Index`. (:issue:`58291`)
+- Bug in :meth:`DataFrameGroupBy.cumsum` where it did not return the correct dtype when the label contained ``None``. (:issue:`58811`)
 - Bug in :meth:`DataFrameGroupby.transform` and :meth:`SeriesGroupby.transform` with a reducer and ``observed=False`` that coerces dtype to float when there are unobserved categories. (:issue:`55326`)
 - Bug in :meth:`Rolling.apply` where the applied function could be called on fewer than ``min_period`` periods if ``method="table"``. (:issue:`58868`)
 
@@ -596,8 +597,10 @@ Styler
 Other
 ^^^^^
 - Bug in :class:`DataFrame` when passing a ``dict`` with a NA scalar and ``columns`` that would always return ``np.nan`` (:issue:`57205`)
+- Bug in :func:`eval` on :class:`ExtensionArray` on including division ``/`` failed with a ``TypeError``. (:issue:`58748`)
 - Bug in :func:`eval` where the names of the :class:`Series` were not preserved when using ``engine="numexpr"``. (:issue:`10239`)
 - Bug in :func:`unique` on :class:`Index` not always returning :class:`Index` (:issue:`57043`)
+- Bug in :meth:`DataFrame.apply` where passing ``engine="numba"`` ignored ``args`` passed to the applied function (:issue:`58712`)
 - Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which caused an exception when using NumPy attributes via ``@`` notation, e.g., ``df.eval("@np.floor(a)")``. (:issue:`58041`)
 - Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which did not allow to use ``tan`` function. (:issue:`55091`)
 - Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` and ``ascending=False`` not returning a :class:`RangeIndex` columns (:issue:`57293`)

diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
@@ -399,7 +399,12 @@ def group_cumsum(
             lab = labels[i]
 
             if lab < 0:
+                # GH#58811
+                if uses_mask:
+                    result_mask[i, :] = True
+                    out[i, :] = 0
                 continue
+
             for j in range(K):
                 val = values[i, j]
 

diff --git a/pandas/core/_numba/executor.py b/pandas/core/_numba/executor.py
@@ -14,17 +14,19 @@
 
 from pandas.compat._optional import import_optional_dependency
 
+from pandas.core.util.numba_ import jit_user_function
+
 
 @functools.cache
 def generate_apply_looper(func, nopython=True, nogil=True, parallel=False):
     if TYPE_CHECKING:
         import numba
     else:
         numba = import_optional_dependency("numba")
-    nb_compat_func = numba.extending.register_jitable(func)
+    nb_compat_func = jit_user_function(func)
 
     @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
-    def nb_looper(values, axis):
+    def nb_looper(values, axis, *args):
         # Operate on the first row/col in order to get
         # the output shape
         if axis == 0:
@@ -33,7 +35,7 @@ def nb_looper(values, axis):
         else:
             first_elem = values[0]
             dim0 = values.shape[0]
-        res0 = nb_compat_func(first_elem)
+        res0 = nb_compat_func(first_elem, *args)
         # Use np.asarray to get shape for
         # https://github.com/numba/numba/issues/4202#issuecomment-1185981507
         buf_shape = (dim0,) + np.atleast_1d(np.asarray(res0)).shape
@@ -44,11 +46,11 @@ def nb_looper(values, axis):
         if axis == 1:
             buff[0] = res0
             for i in numba.prange(1, values.shape[0]):
-                buff[i] = nb_compat_func(values[i])
+                buff[i] = nb_compat_func(values[i], *args)
         else:
             buff[:, 0] = res0
             for j in numba.prange(1, values.shape[1]):
-                buff[:, j] = nb_compat_func(values[:, j])
+                buff[:, j] = nb_compat_func(values[:, j], *args)
         return buff
 
     return nb_looper

diff --git a/pandas/core/apply.py b/pandas/core/apply.py
@@ -51,6 +51,10 @@
 from pandas.core._numba.executor import generate_apply_looper
 import pandas.core.common as com
 from pandas.core.construction import ensure_wrapped_if_datetimelike
+from pandas.core.util.numba_ import (
+    get_jit_arguments,
+    prepare_function_arguments,
+)
 
 if TYPE_CHECKING:
     from collections.abc import (
@@ -70,7 +74,6 @@
     from pandas.core.resample import Resampler
     from pandas.core.window.rolling import BaseWindow
 
-
 ResType = dict[int, Any]
 
 
@@ -997,17 +1000,20 @@ def wrapper(*args, **kwargs):
             return wrapper
 
         if engine == "numba":
-            engine_kwargs = {} if engine_kwargs is None else engine_kwargs
-
+            args, kwargs = prepare_function_arguments(
+                self.func,  # type: ignore[arg-type]
+                self.args,
+                self.kwargs,
+            )
             # error: Argument 1 to "__call__" of "_lru_cache_wrapper" has
             # incompatible type "Callable[..., Any] | str | list[Callable
             # [..., Any] | str] | dict[Hashable,Callable[..., Any] | str |
             # list[Callable[..., Any] | str]]"; expected "Hashable"
             nb_looper = generate_apply_looper(
                 self.func,  # type: ignore[arg-type]
-                **engine_kwargs,
+                **get_jit_arguments(engine_kwargs, kwargs),
             )
-            result = nb_looper(self.values, self.axis)
+            result = nb_looper(self.values, self.axis, *args)
             # If we made the result 2-D, squeeze it back to 1-D
             result = np.squeeze(result)
         else:
@@ -1148,21 +1154,23 @@ def generate_numba_apply_func(
         # Currently the parallel argument doesn't get passed through here
         # (it's disabled) since the dicts in numba aren't thread-safe.
         @numba.jit(nogil=nogil, nopython=nopython, parallel=parallel)
-        def numba_func(values, col_names, df_index):
+        def numba_func(values, col_names, df_index, *args):
             results = {}
             for j in range(values.shape[1]):
                 # Create the series
                 ser = Series(
                     values[:, j], index=df_index, name=maybe_cast_str(col_names[j])
                 )
-                results[j] = jitted_udf(ser)
+                results[j] = jitted_udf(ser, *args)
             return results
 
         return numba_func
 
     def apply_with_numba(self) -> dict[int, Any]:
+        func = cast(Callable, self.func)
+        args, kwargs = prepare_function_arguments(func, self.args, self.kwargs)
         nb_func = self.generate_numba_apply_func(
-            cast(Callable, self.func), **self.engine_kwargs
+            func, **get_jit_arguments(self.engine_kwargs, kwargs)
         )
         from pandas.core._numba.extensions import set_numba_data
 
@@ -1177,7 +1185,7 @@ def apply_with_numba(self) -> dict[int, Any]:
         # Convert from numba dict to regular dict
         # Our isinstance checks in the df constructor don't pass for numbas typed dict
         with set_numba_data(index) as index, set_numba_data(columns) as columns:
-            res = dict(nb_func(self.values, columns, index))
+            res = dict(nb_func(self.values, columns, index, *args))
         return res
 
     @property
@@ -1285,7 +1293,7 @@ def generate_numba_apply_func(
         jitted_udf = numba.extending.register_jitable(func)
 
         @numba.jit(nogil=nogil, nopython=nopython, parallel=parallel)
-        def numba_func(values, col_names_index, index):
+        def numba_func(values, col_names_index, index, *args):
             results = {}
             # Currently the parallel argument doesn't get passed through here
             # (it's disabled) since the dicts in numba aren't thread-safe.
@@ -1297,15 +1305,17 @@ def numba_func(values, col_names_index, index):
                     index=col_names_index,
                     name=maybe_cast_str(index[i]),
                 )
-                results[i] = jitted_udf(ser)
+                results[i] = jitted_udf(ser, *args)
 
             return results
 
         return numba_func
 
     def apply_with_numba(self) -> dict[int, Any]:
+        func = cast(Callable, self.func)
+        args, kwargs = prepare_function_arguments(func, self.args, self.kwargs)
         nb_func = self.generate_numba_apply_func(
-            cast(Callable, self.func), **self.engine_kwargs
+            func, **get_jit_arguments(self.engine_kwargs, kwargs)
         )
 
         from pandas.core._numba.extensions import set_numba_data
@@ -1316,7 +1326,7 @@ def apply_with_numba(self) -> dict[int, Any]:
             set_numba_data(self.obj.index) as index,
             set_numba_data(self.columns) as columns,
         ):
-            res = dict(nb_func(self.values, columns, index))
+            res = dict(nb_func(self.values, columns, index, *args))
 
         return res
 

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
@@ -2970,7 +2970,7 @@ def transpose_homogeneous_pyarrow(
     """
     arrays = list(arrays)
     nrows, ncols = len(arrays[0]), len(arrays)
-    indices = np.arange(nrows * ncols).reshape(ncols, nrows).T.flatten()
+    indices = np.arange(nrows * ncols).reshape(ncols, nrows).T.reshape(-1)
     arr = pa.chunked_array([chunk for arr in arrays for chunk in arr._pa_array.chunks])
     arr = arr.take(indices)
     return [ArrowExtensionArray(arr.slice(i * ncols, ncols)) for i in range(nrows)]
diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py
@@ -2119,6 +2119,32 @@ def isocalendar(self) -> DataFrame:
 
         >>> idx.is_year_start
         array([False, False,  True])
+
+        This method, when applied to Series with datetime values under
+        the ``.dt`` accessor, will lose information about Business offsets.
+
+        >>> dates = pd.Series(pd.date_range("2020-10-30", periods=4, freq="BYS"))
+        >>> dates
+        0   2021-01-01
+        1   2022-01-03
+        2   2023-01-02
+        3   2024-01-01
+        dtype: datetime64[ns]
+
+        >>> dates.dt.is_year_start
+        0    True
+        1    False
+        2    False
+        3    True
+        dtype: bool
+
+        >>> idx = pd.date_range("2020-10-30", periods=4, freq="BYS")
+        >>> idx
+        DatetimeIndex(['2021-01-01', '2022-01-03', '2023-01-02', '2024-01-01'],
+                      dtype='datetime64[ns]', freq='BYS-JAN')
+
+        >>> idx.is_year_start
+        array([ True,  True,  True,  True])
         """,
     )
     is_year_end = _field_accessor(

diff --git a/pandas/core/computation/ops.py b/pandas/core/computation/ops.py
@@ -19,6 +19,7 @@
 
 from pandas.core.dtypes.common import (
     is_list_like,
+    is_numeric_dtype,
     is_scalar,
 )
 
@@ -508,10 +509,6 @@ def _disallow_scalar_only_bool_ops(self) -> None:
             raise NotImplementedError("cannot evaluate scalar only bool ops")
 
 
-def isnumeric(dtype) -> bool:
-    return issubclass(np.dtype(dtype).type, np.number)
-
-
 class Div(BinOp):
     """
     Div operator to special case casting.
@@ -525,7 +522,9 @@ class Div(BinOp):
     def __init__(self, lhs, rhs) -> None:
         super().__init__("/", lhs, rhs)
 
-        if not isnumeric(lhs.return_type) or not isnumeric(rhs.return_type):
+        if not is_numeric_dtype(lhs.return_type) or not is_numeric_dtype(
+            rhs.return_type
+        ):
             raise TypeError(
                 f"unsupported operand type(s) for {self.op}: "
                 f"'{lhs.return_type}' and '{rhs.return_type}'"

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -9271,7 +9271,9 @@ def compare(
 
         # reorder axis to keep things organized
         indices = (
-            np.arange(diff.shape[axis]).reshape([2, diff.shape[axis] // 2]).T.flatten()
+            np.arange(diff.shape[axis])
+            .reshape([2, diff.shape[axis] // 2])
+            .T.reshape(-1)
         )
         diff = diff.take(indices, axis=axis)
 

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
@@ -1031,6 +1031,13 @@ def nlevels(self) -> int:
         """
         Integer number of levels in this MultiIndex.
 
+        See Also
+        --------
+        MultiIndex.levels : Get the levels of the MultiIndex.
+        MultiIndex.codes : Get the codes of the MultiIndex.
+        MultiIndex.from_arrays : Convert arrays to MultiIndex.
+        MultiIndex.from_tuples : Convert list of tuples to MultiIndex.
+
         Examples
         --------
         >>> mi = pd.MultiIndex.from_arrays([["a"], ["b"], ["c"]])

diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
@@ -2440,7 +2440,7 @@ def _align_frame(self, indexer, df: DataFrame) -> DataFrame:
                 ax = self.obj.axes[i]
                 if is_sequence(ix) or isinstance(ix, slice):
                     if isinstance(ix, np.ndarray):
-                        ix = ix.ravel()
+                        ix = ix.reshape(-1)
                     if idx is None:
                         idx = ax[ix]
                     elif cols is None: