1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
@@ -210,6 +210,7 @@ Other enhancements
- :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now support Python's new-style format strings (e.g., ``"{:.6f}"``) for the ``float_format`` parameter, in addition to old-style ``%`` format strings and callables. This allows for more flexible and modern formatting of floating point numbers when exporting to CSV. (:issue:`49580`)
- :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now support positional arguments passed as kwargs (:issue:`58995`)
- :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`)
- :meth:`Series.corr`, :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith` with ``method="kendall"`` and ``method="spearman"`` now work with ordered categorical data types (:issue:`60306`)
- :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`)
- :meth:`Series.map` now accepts an ``engine`` parameter to allow execution with a third-party execution engine (:issue:`61125`)
- :meth:`Series.rank` and :meth:`DataFrame.rank` with numpy-nullable dtypes preserve ``NA`` values and return ``UInt64`` dtype where appropriate instead of casting ``NA`` to ``NaN`` with ``float64`` dtype (:issue:`62043`)
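Illustrative usage (not part of the diff): a minimal sketch of the behavior the new whatsnew entry describes, assuming this change is applied and SciPy is installed (pandas delegates method="spearman"/"kendall" to scipy.stats).

import pandas as pd

ser_cat = pd.Series(
    pd.Categorical(
        ["low", "med", "high"], categories=["low", "med", "high"], ordered=True
    )
)
ser_num = pd.Series([1.0, 2.0, 3.0])

# Ordered categories are ranked by their category order, so a perfectly
# monotone pairing gives a Spearman correlation of 1.0; previously this
# combination of dtypes was not supported.
print(ser_cat.corr(ser_num, method="spearman"))  # 1.0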
40 changes: 40 additions & 0 deletions pandas/core/frame.py
@@ -115,6 +115,7 @@
from pandas.core.dtypes.dtypes import (
ArrowDtype,
BaseMaskedDtype,
CategoricalDtype,
ExtensionDtype,
)
from pandas.core.dtypes.generic import (
@@ -11680,6 +11681,10 @@ def corr(
data = self._get_numeric_data() if numeric_only else self
cols = data.columns
idx = cols.copy()

if method in ("spearman", "kendall"):
data = data._transform_ord_cat_cols_to_coded_cols()

mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)

if method == "pearson":
@@ -11969,6 +11974,8 @@ def corrwith(
correl = num / dom

elif method in ["kendall", "spearman"] or callable(method):
left = left._transform_ord_cat_cols_to_coded_cols()
right = right._transform_ord_cat_cols_to_coded_cols()

def c(x):
return nanops.nancorr(x[0], x[1], method=method)
@@ -12000,6 +12007,39 @@ def c(x):

return correl

def _transform_ord_cat_cols_to_coded_cols(self) -> DataFrame:
"""
any ordered categorical columns are transformed to the respective
categorical codes while other columns remain untouched
"""
categ = self.select_dtypes("category")
if len(categ.columns) == 0:
return self

data = self.copy(deep=False)
cols_convert = categ.loc[:, categ.agg(lambda x: x.cat.ordered)].columns.unique()
single_cols = [col for col in cols_convert if isinstance(data[col], Series)]
duplicated_cols = [
col for col in cols_convert if isinstance(data[col], DataFrame)
]

if not single_cols and not duplicated_cols:
return self

if single_cols:
data[single_cols] = data[single_cols].apply(
lambda x: x.cat.codes.replace(-1, np.nan)
)

if duplicated_cols:
data[duplicated_cols] = data[duplicated_cols].apply(
lambda x: x.cat.codes.replace(-1, np.nan)
if isinstance(x.dtype, CategoricalDtype) and bool(x.dtype.ordered)
else x
)

return data

# ----------------------------------------------------------------------
# ndarray-like stats methods

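Illustrative sketch (not part of the diff) of the per-column mapping that _transform_ord_cat_cols_to_coded_cols applies to ordered categorical columns: the category codes are used, with the missing-value sentinel -1 replaced by NaN so that nancorr can drop those pairs.

import numpy as np
import pandas as pd

col = pd.Series(
    pd.Categorical(
        ["m", "h", None, "low"], categories=["low", "m", "h"], ordered=True
    )
)
# col.cat.codes is [1, 2, -1, 0]; replacing -1 with NaN marks the missing
# entry, mirroring the helper above.
print(col.cat.codes.replace(-1, np.nan).tolist())  # [1.0, 2.0, nan, 0.0]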
6 changes: 6 additions & 0 deletions pandas/core/series.py
@@ -2685,6 +2685,12 @@ def corr(
if len(this) == 0:
return np.nan

if method in ("spearman", "kendall"):
if this.dtype == "category" and this.cat.ordered:
this = this.cat.codes.replace(-1, np.nan)
if other.dtype == "category" and other.cat.ordered:
other = other.cat.codes.replace(-1, np.nan)

this_values = this.to_numpy(dtype=float, na_value=np.nan, copy=False)
other_values = other.to_numpy(dtype=float, na_value=np.nan, copy=False)

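Quick equivalence check (not part of the diff; requires SciPy): with this change, Series.corr on an ordered categorical should match scipy.stats applied directly to the category codes, mirroring what the new tests assert.

import numpy as np
import pandas as pd
from scipy import stats

cat = pd.Series(
    pd.Categorical(["h", "low", "m"], categories=["low", "m", "h"], ordered=True)
)
num = pd.Series([2.5, 1.0, 3.0])

result = cat.corr(num, method="kendall")  # uses the new code path
expected = stats.kendalltau(cat.cat.codes, num)[0]
print(np.isclose(result, expected))  # True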
89 changes: 89 additions & 0 deletions pandas/tests/frame/methods/test_cov_corr.py
@@ -1,3 +1,5 @@
from itertools import combinations

import numpy as np
import pytest

@@ -252,6 +254,46 @@ def test_corr_numeric_only(self, meth, numeric_only):
with pytest.raises(ValueError, match="could not convert string to float"):
df.corr(meth, numeric_only=numeric_only)

@pytest.mark.parametrize("method", ["kendall", "spearman"])
@td.skip_if_no("scipy")
def test_corr_rank_ordered_categorical(
self,
method,
):
df = DataFrame(
{
"ord_cat": Series(
pd.Categorical(
["low", "m", "h", "vh"],
categories=["low", "m", "h", "vh"],
ordered=True,
)
),
"ord_cat_none": Series(
pd.Categorical(
["low", "m", "h", None],
categories=["low", "m", "h"],
ordered=True,
)
),
"ord_int": Series([0, 1, 2, 3]),
"ord_float": Series([2.0, 3.0, 4.5, 6.5]),
"ord_float_nan": Series([2.0, 3.0, 4.5, np.nan]),
"ord_cat_shuff": Series(
pd.Categorical(
["m", "h", "vh", "low"],
categories=["low", "m", "h", "vh"],
ordered=True,
)
),
"ord_int_shuff": Series([2, 3, 0, 1]),
}
)
corr_calc = df.corr(method=method)
for col1, col2 in combinations(df.columns, r=2):
corr_expected = df[col1].corr(df[col2], method=method)
tm.assert_almost_equal(corr_calc[col1][col2], corr_expected)


class TestDataFrameCorrWith:
@pytest.mark.parametrize(
@@ -493,3 +535,50 @@ def test_cov_with_missing_values(self):
result2 = df.dropna().cov()
tm.assert_frame_equal(result1, expected)
tm.assert_frame_equal(result2, expected)

@pytest.mark.parametrize("method", ["kendall", "spearman"])
def test_corr_rank_ordered_categorical(
self,
method,
):
pytest.importorskip("scipy")
df1 = DataFrame(
{
"a": Series(
pd.Categorical(
["low", "m", "h", "vh"],
categories=["low", "m", "h", "vh"],
ordered=True,
)
),
"b": Series(
pd.Categorical(
["low", "m", "h", None],
categories=["low", "m", "h"],
ordered=True,
)
),
"c": Series([0, 1, 2, 3]),
"d": Series([2.0, 3.0, 4.5, 6.5]),
}
)

df2 = DataFrame(
{
"a": Series([2.0, 3.0, 4.5, np.nan]),
"b": Series(
pd.Categorical(
["m", "h", "vh", "low"],
categories=["low", "m", "h", "vh"],
ordered=True,
)
),
"c": Series([2, 3, 0, 1]),
"d": Series([2.0, 3.0, 4.5, 6.5]),
}
)

corr_calc = df1.corrwith(df2, method=method)
for col in df1.columns:
corr_expected = df1[col].corr(df2[col], method=method)
tm.assert_almost_equal(corr_calc.get(col), corr_expected)
74 changes: 74 additions & 0 deletions pandas/tests/series/methods/test_cov_corr.py
@@ -184,3 +184,77 @@ def test_corr_callable_method(self, datetime_series):
df = pd.DataFrame([s1, s2])
expected = pd.DataFrame([{0: 1.0, 1: 0}, {0: 0, 1: 1.0}])
tm.assert_almost_equal(df.transpose().corr(method=my_corr), expected)

@pytest.mark.parametrize("method", ["kendall", "spearman"])
def test_corr_rank_ordered_categorical(
Review comment (Member): This test is pretty long, to the point where it's unclear what its intent is. Maybe it's worth breaking it up into a few tests? Or adding parameterization?

self,
method,
):
stats = pytest.importorskip("scipy.stats")
method_scipy_func = {"kendall": stats.kendalltau, "spearman": stats.spearmanr}
ser_ord_cat = Series(
pd.Categorical(
["low", "med", "high", "very_high"],
categories=["low", "med", "high", "very_high"],
ordered=True,
)
)
ser_ord_cat_codes = ser_ord_cat.cat.codes.replace(-1, np.nan)
ser_ord_int = Series([0, 1, 2, 3])
ser_ord_float = Series([2.0, 3.0, 4.5, 6.5])

corr_calc = ser_ord_cat.corr(ser_ord_int, method=method)
corr_expected = method_scipy_func[method](
ser_ord_cat_codes, ser_ord_int, nan_policy="omit"
)[0]
tm.assert_almost_equal(corr_calc, corr_expected)

corr_calc = ser_ord_cat.corr(ser_ord_float, method=method)
corr_expected = method_scipy_func[method](
ser_ord_cat_codes, ser_ord_float, nan_policy="omit"
)[0]
tm.assert_almost_equal(corr_calc, corr_expected)

corr_calc = ser_ord_cat.corr(ser_ord_cat, method=method)
corr_expected = method_scipy_func[method](
ser_ord_cat_codes, ser_ord_cat_codes, nan_policy="omit"
)[0]
tm.assert_almost_equal(corr_calc, corr_expected)

ser_ord_cat_shuff = Series(
pd.Categorical(
["high", "low", "very_high", "med"],
categories=["low", "med", "high", "very_high"],
ordered=True,
)
)
ser_ord_cat_shuff_codes = ser_ord_cat_shuff.cat.codes.replace(-1, np.nan)

corr_calc = ser_ord_cat_shuff.corr(ser_ord_cat, method=method)
corr_expected = method_scipy_func[method](
ser_ord_cat_shuff_codes, ser_ord_cat_codes, nan_policy="omit"
)[0]
tm.assert_almost_equal(corr_calc, corr_expected)

corr_calc = ser_ord_cat_shuff.corr(ser_ord_cat_shuff, method=method)
corr_expected = method_scipy_func[method](
ser_ord_cat_shuff_codes, ser_ord_cat_shuff_codes, nan_policy="omit"
)[0]
tm.assert_almost_equal(corr_calc, corr_expected)

ser_ord_cat_with_nan = Series(
pd.Categorical(
["h", "low", "vh", None, "m"],
categories=["low", "m", "h", "vh"],
ordered=True,
)
)
ser_ord_cat_shuff_with_nan_codes = ser_ord_cat_with_nan.cat.codes.replace(
-1, np.nan
)
ser_ord_int = Series([2, 0, 1, 3, None])
corr_calc = ser_ord_cat_with_nan.corr(ser_ord_int, method=method)
corr_expected = method_scipy_func[method](
ser_ord_cat_shuff_with_nan_codes, ser_ord_int, nan_policy="omit"
)[0]
tm.assert_almost_equal(corr_calc, corr_expected)