Skip to content

Commit

Permalink
Merge branch 'main' into pandas.NamedAgg
Browse files Browse the repository at this point in the history
  • Loading branch information
tuhinsharma121 committed Jun 17, 2024
2 parents 04d8311 + dd87dd3 commit 3fa6026
Show file tree
Hide file tree
Showing 34 changed files with 680 additions and 327 deletions.
2 changes: 0 additions & 2 deletions ci/code_checks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
-i "pandas.MultiIndex.get_loc_level PR07" \
-i "pandas.MultiIndex.levshape SA01" \
-i "pandas.MultiIndex.names SA01" \
-i "pandas.MultiIndex.nlevels SA01" \
-i "pandas.MultiIndex.remove_unused_levels RT03,SA01" \
-i "pandas.MultiIndex.reorder_levels RT03,SA01" \
-i "pandas.MultiIndex.set_levels RT03,SA01" \
Expand Down Expand Up @@ -464,7 +463,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
-i "pandas.io.stata.StataReader.variable_labels RT03,SA01" \
-i "pandas.io.stata.StataWriter.write_file SA01" \
-i "pandas.json_normalize RT03,SA01" \
-i "pandas.merge PR07" \
-i "pandas.merge_asof PR07,RT03" \
-i "pandas.period_range RT03,SA01" \
-i "pandas.plotting.andrews_curves RT03,SA01" \
Expand Down
444 changes: 222 additions & 222 deletions doc/data/titanic.csv

Large diffs are not rendered by default.

Binary file modified doc/source/_static/schemas/01_table_spreadsheet.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion doc/source/development/policies.rst
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ pandas may change the behavior of experimental features at any time.
Python support
~~~~~~~~~~~~~~

pandas mirrors the `NumPy guidelines for Python support <https://numpy.org/neps/nep-0029-deprecation_policy.html#implementation>`__.
pandas mirrors the `SPEC 0 guideline for Python support <https://scientific-python.org/specs/spec-0000>`__.

Security policy
~~~~~~~~~~~~~~~
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ I want to store passenger data of the Titanic. For a number of passengers, I kno
"Name": [
"Braund, Mr. Owen Harris",
"Allen, Mr. William Henry",
"Bonnell, Miss. Elizabeth",
"Bonnell, Miss Elizabeth",
],
"Age": [22, 35, 58],
"Sex": ["male", "male", "female"],
Expand Down
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -571,6 +571,7 @@ Groupby/resample/rolling
- Bug in :meth:`DataFrameGroupBy.agg` that raises ``AttributeError`` when there is dictionary input and duplicated columns, instead of returning a DataFrame with the aggregation of all duplicate columns. (:issue:`55041`)
- Bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. (:issue:`57775`)
- Bug in :meth:`DataFrameGroupBy.apply` with ``as_index=False`` that was returning :class:`MultiIndex` instead of returning :class:`Index`. (:issue:`58291`)
- Bug in :meth:`DataFrameGroupBy.cumsum` where it did not return the correct dtype when the label contained ``None``. (:issue:`58811`)
- Bug in :meth:`DataFrameGroupBy.transform` and :meth:`SeriesGroupBy.transform` with a reducer and ``observed=False`` that coerces dtype to float when there are unobserved categories. (:issue:`55326`)
- Bug in :meth:`Rolling.apply` where the applied function could be called on fewer than ``min_periods`` periods if ``method="table"``. (:issue:`58868`)

Expand All @@ -596,8 +597,10 @@ Styler
Other
^^^^^
- Bug in :class:`DataFrame` when passing a ``dict`` with a NA scalar and ``columns`` that would always return ``np.nan`` (:issue:`57205`)
- Bug in :func:`eval` where expressions including division ``/`` on :class:`ExtensionArray` operands failed with a ``TypeError``. (:issue:`58748`)
- Bug in :func:`eval` where the names of the :class:`Series` were not preserved when using ``engine="numexpr"``. (:issue:`10239`)
- Bug in :func:`unique` on :class:`Index` not always returning :class:`Index` (:issue:`57043`)
- Bug in :meth:`DataFrame.apply` where passing ``engine="numba"`` ignored ``args`` passed to the applied function (:issue:`58712`)
- Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which caused an exception when using NumPy attributes via ``@`` notation, e.g., ``df.eval("@np.floor(a)")``. (:issue:`58041`)
- Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which did not allow the use of the ``tan`` function. (:issue:`55091`)
- Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` and ``ascending=False`` not returning :class:`RangeIndex` columns (:issue:`57293`)
Expand Down
5 changes: 5 additions & 0 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -399,7 +399,12 @@ def group_cumsum(
lab = labels[i]

if lab < 0:
# GH#58811
if uses_mask:
result_mask[i, :] = True
out[i, :] = 0
continue

for j in range(K):
val = values[i, j]

Expand Down
12 changes: 7 additions & 5 deletions pandas/core/_numba/executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,19 @@

from pandas.compat._optional import import_optional_dependency

from pandas.core.util.numba_ import jit_user_function


@functools.cache
def generate_apply_looper(func, nopython=True, nogil=True, parallel=False):
if TYPE_CHECKING:
import numba
else:
numba = import_optional_dependency("numba")
nb_compat_func = numba.extending.register_jitable(func)
nb_compat_func = jit_user_function(func)

@numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
def nb_looper(values, axis):
def nb_looper(values, axis, *args):
# Operate on the first row/col in order to get
# the output shape
if axis == 0:
Expand All @@ -33,7 +35,7 @@ def nb_looper(values, axis):
else:
first_elem = values[0]
dim0 = values.shape[0]
res0 = nb_compat_func(first_elem)
res0 = nb_compat_func(first_elem, *args)
# Use np.asarray to get shape for
# https://github.com/numba/numba/issues/4202#issuecomment-1185981507
buf_shape = (dim0,) + np.atleast_1d(np.asarray(res0)).shape
Expand All @@ -44,11 +46,11 @@ def nb_looper(values, axis):
if axis == 1:
buff[0] = res0
for i in numba.prange(1, values.shape[0]):
buff[i] = nb_compat_func(values[i])
buff[i] = nb_compat_func(values[i], *args)
else:
buff[:, 0] = res0
for j in numba.prange(1, values.shape[1]):
buff[:, j] = nb_compat_func(values[:, j])
buff[:, j] = nb_compat_func(values[:, j], *args)
return buff

return nb_looper
Expand Down
36 changes: 23 additions & 13 deletions pandas/core/apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,10 @@
from pandas.core._numba.executor import generate_apply_looper
import pandas.core.common as com
from pandas.core.construction import ensure_wrapped_if_datetimelike
from pandas.core.util.numba_ import (
get_jit_arguments,
prepare_function_arguments,
)

if TYPE_CHECKING:
from collections.abc import (
Expand All @@ -70,7 +74,6 @@
from pandas.core.resample import Resampler
from pandas.core.window.rolling import BaseWindow


ResType = dict[int, Any]


Expand Down Expand Up @@ -997,17 +1000,20 @@ def wrapper(*args, **kwargs):
return wrapper

if engine == "numba":
engine_kwargs = {} if engine_kwargs is None else engine_kwargs

args, kwargs = prepare_function_arguments(
self.func, # type: ignore[arg-type]
self.args,
self.kwargs,
)
# error: Argument 1 to "__call__" of "_lru_cache_wrapper" has
# incompatible type "Callable[..., Any] | str | list[Callable
# [..., Any] | str] | dict[Hashable,Callable[..., Any] | str |
# list[Callable[..., Any] | str]]"; expected "Hashable"
nb_looper = generate_apply_looper(
self.func, # type: ignore[arg-type]
**engine_kwargs,
**get_jit_arguments(engine_kwargs, kwargs),
)
result = nb_looper(self.values, self.axis)
result = nb_looper(self.values, self.axis, *args)
# If we made the result 2-D, squeeze it back to 1-D
result = np.squeeze(result)
else:
Expand Down Expand Up @@ -1148,21 +1154,23 @@ def generate_numba_apply_func(
# Currently the parallel argument doesn't get passed through here
# (it's disabled) since the dicts in numba aren't thread-safe.
@numba.jit(nogil=nogil, nopython=nopython, parallel=parallel)
def numba_func(values, col_names, df_index):
def numba_func(values, col_names, df_index, *args):
results = {}
for j in range(values.shape[1]):
# Create the series
ser = Series(
values[:, j], index=df_index, name=maybe_cast_str(col_names[j])
)
results[j] = jitted_udf(ser)
results[j] = jitted_udf(ser, *args)
return results

return numba_func

def apply_with_numba(self) -> dict[int, Any]:
func = cast(Callable, self.func)
args, kwargs = prepare_function_arguments(func, self.args, self.kwargs)
nb_func = self.generate_numba_apply_func(
cast(Callable, self.func), **self.engine_kwargs
func, **get_jit_arguments(self.engine_kwargs, kwargs)
)
from pandas.core._numba.extensions import set_numba_data

Expand All @@ -1177,7 +1185,7 @@ def apply_with_numba(self) -> dict[int, Any]:
# Convert from numba dict to regular dict
# Our isinstance checks in the df constructor don't pass for numba's typed dict
with set_numba_data(index) as index, set_numba_data(columns) as columns:
res = dict(nb_func(self.values, columns, index))
res = dict(nb_func(self.values, columns, index, *args))
return res

@property
Expand Down Expand Up @@ -1285,7 +1293,7 @@ def generate_numba_apply_func(
jitted_udf = numba.extending.register_jitable(func)

@numba.jit(nogil=nogil, nopython=nopython, parallel=parallel)
def numba_func(values, col_names_index, index):
def numba_func(values, col_names_index, index, *args):
results = {}
# Currently the parallel argument doesn't get passed through here
# (it's disabled) since the dicts in numba aren't thread-safe.
Expand All @@ -1297,15 +1305,17 @@ def numba_func(values, col_names_index, index):
index=col_names_index,
name=maybe_cast_str(index[i]),
)
results[i] = jitted_udf(ser)
results[i] = jitted_udf(ser, *args)

return results

return numba_func

def apply_with_numba(self) -> dict[int, Any]:
func = cast(Callable, self.func)
args, kwargs = prepare_function_arguments(func, self.args, self.kwargs)
nb_func = self.generate_numba_apply_func(
cast(Callable, self.func), **self.engine_kwargs
func, **get_jit_arguments(self.engine_kwargs, kwargs)
)

from pandas.core._numba.extensions import set_numba_data
Expand All @@ -1316,7 +1326,7 @@ def apply_with_numba(self) -> dict[int, Any]:
set_numba_data(self.obj.index) as index,
set_numba_data(self.columns) as columns,
):
res = dict(nb_func(self.values, columns, index))
res = dict(nb_func(self.values, columns, index, *args))

return res

Expand Down
2 changes: 1 addition & 1 deletion pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -2970,7 +2970,7 @@ def transpose_homogeneous_pyarrow(
"""
arrays = list(arrays)
nrows, ncols = len(arrays[0]), len(arrays)
indices = np.arange(nrows * ncols).reshape(ncols, nrows).T.flatten()
indices = np.arange(nrows * ncols).reshape(ncols, nrows).T.reshape(-1)
arr = pa.chunked_array([chunk for arr in arrays for chunk in arr._pa_array.chunks])
arr = arr.take(indices)
return [ArrowExtensionArray(arr.slice(i * ncols, ncols)) for i in range(nrows)]
26 changes: 26 additions & 0 deletions pandas/core/arrays/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -2119,6 +2119,32 @@ def isocalendar(self) -> DataFrame:
>>> idx.is_year_start
array([False, False, True])
This method, when applied to Series with datetime values under
the ``.dt`` accessor, will lose information about Business offsets.
>>> dates = pd.Series(pd.date_range("2020-10-30", periods=4, freq="BYS"))
>>> dates
0 2021-01-01
1 2022-01-03
2 2023-01-02
3 2024-01-01
dtype: datetime64[ns]
>>> dates.dt.is_year_start
0 True
1 False
2 False
3 True
dtype: bool
>>> idx = pd.date_range("2020-10-30", periods=4, freq="BYS")
>>> idx
DatetimeIndex(['2021-01-01', '2022-01-03', '2023-01-02', '2024-01-01'],
dtype='datetime64[ns]', freq='BYS-JAN')
>>> idx.is_year_start
array([ True, True, True, True])
""",
)
is_year_end = _field_accessor(
Expand Down
9 changes: 4 additions & 5 deletions pandas/core/computation/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

from pandas.core.dtypes.common import (
is_list_like,
is_numeric_dtype,
is_scalar,
)

Expand Down Expand Up @@ -508,10 +509,6 @@ def _disallow_scalar_only_bool_ops(self) -> None:
raise NotImplementedError("cannot evaluate scalar only bool ops")


def isnumeric(dtype) -> bool:
return issubclass(np.dtype(dtype).type, np.number)


class Div(BinOp):
"""
Div operator to special case casting.
Expand All @@ -525,7 +522,9 @@ class Div(BinOp):
def __init__(self, lhs, rhs) -> None:
super().__init__("/", lhs, rhs)

if not isnumeric(lhs.return_type) or not isnumeric(rhs.return_type):
if not is_numeric_dtype(lhs.return_type) or not is_numeric_dtype(
rhs.return_type
):
raise TypeError(
f"unsupported operand type(s) for {self.op}: "
f"'{lhs.return_type}' and '{rhs.return_type}'"
Expand Down
4 changes: 3 additions & 1 deletion pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -9271,7 +9271,9 @@ def compare(

# reorder axis to keep things organized
indices = (
np.arange(diff.shape[axis]).reshape([2, diff.shape[axis] // 2]).T.flatten()
np.arange(diff.shape[axis])
.reshape([2, diff.shape[axis] // 2])
.T.reshape(-1)
)
diff = diff.take(indices, axis=axis)

Expand Down
7 changes: 7 additions & 0 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -1031,6 +1031,13 @@ def nlevels(self) -> int:
"""
Integer number of levels in this MultiIndex.
See Also
--------
MultiIndex.levels : Get the levels of the MultiIndex.
MultiIndex.codes : Get the codes of the MultiIndex.
MultiIndex.from_arrays : Convert arrays to MultiIndex.
MultiIndex.from_tuples : Convert list of tuples to MultiIndex.
Examples
--------
>>> mi = pd.MultiIndex.from_arrays([["a"], ["b"], ["c"]])
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -2440,7 +2440,7 @@ def _align_frame(self, indexer, df: DataFrame) -> DataFrame:
ax = self.obj.axes[i]
if is_sequence(ix) or isinstance(ix, slice):
if isinstance(ix, np.ndarray):
ix = ix.ravel()
ix = ix.reshape(-1)
if idx is None:
idx = ax[ix]
elif cols is None:
Expand Down
Loading

0 comments on commit 3fa6026

Please sign in to comment.