diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index 849b9d45da5ad..e574554c8b7b5 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -859,7 +859,7 @@ Performance improvements
 - Performance improvement in :meth:`Series.isin` for nullable data types (:issue:`38340`)
 - Performance improvement in :meth:`DataFrame.fillna` with ``method="pad|backfill"`` for nullable floating and nullable integer dtypes (:issue:`39953`)
 - Performance improvement in :meth:`DataFrame.corr` for ``method=kendall`` (:issue:`28329`)
-- Performance improvement in :meth:`DataFrame.corr` for ``method=spearman`` (:issue:`40956`)
+- Performance improvement in :meth:`DataFrame.corr` for ``method=spearman`` (:issue:`40956`, :issue:`41885`)
 - Performance improvement in :meth:`.Rolling.corr` and :meth:`.Rolling.cov` (:issue:`39388`)
 - Performance improvement in :meth:`.RollingGroupby.corr`, :meth:`.ExpandingGroupby.corr`, :meth:`.ExpandingGroupby.corr` and :meth:`.ExpandingGroupby.cov` (:issue:`39591`)
 - Performance improvement in :func:`unique` for object data type (:issue:`37615`)
diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
index f2efeedb80d4d..5da1e3921ccc1 100644
--- a/pandas/_libs/algos.pyx
+++ b/pandas/_libs/algos.pyx
@@ -387,6 +387,7 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr
         float64_t[::1] maskedx, maskedy
         ndarray[uint8_t, ndim=2] mask
         int64_t nobs = 0
+        bint no_nans
         float64_t vx, vy, sumx, sumxx, sumyy, mean, divisor
         const int64_t[:] labels_n, labels_nobs
 
@@ -394,8 +395,15 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr
     # For compatibility when calling rank_1d
     labels_n = np.zeros(N, dtype=np.int64)
 
+    # Handle the edge case where we know all results will be nan
+    # to keep conditional logic inside loop simpler
+    if N < minp:
+        result = np.full((K, K), np.nan, dtype=np.float64)
+        return result
+
     result = np.empty((K, K), dtype=np.float64)
     mask = np.isfinite(mat).view(np.uint8)
+    no_nans = mask.all()
 
     ranked_mat = np.empty((N, K), dtype=np.float64)
 
@@ -409,51 +417,66 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr
     with nogil:
         for xi in range(K):
             for yi in range(xi + 1):
-                nobs = 0
-                # Keep track of whether we need to recompute ranks
-                all_ranks = True
-                for i in range(N):
-                    all_ranks &= not (mask[i, xi] ^ mask[i, yi])
-                    if mask[i, xi] and mask[i, yi]:
-                        maskedx[nobs] = ranked_mat[i, xi]
-                        maskedy[nobs] = ranked_mat[i, yi]
-                        nobs += 1
-
-                if nobs < minp:
-                    result[xi, yi] = result[yi, xi] = NaN
-                else:
-                    if not all_ranks:
-                        with gil:
-                            # We need to slice back to nobs because rank_1d will
-                            # require arrays of nobs length
-                            labels_nobs = np.zeros(nobs, dtype=np.int64)
-                            rankedx = rank_1d(np.array(maskedx)[:nobs],
-                                              labels=labels_nobs)
-                            rankedy = rank_1d(np.array(maskedy)[:nobs],
-                                              labels=labels_nobs)
-                        for i in range(nobs):
-                            maskedx[i] = rankedx[i]
-                            maskedy[i] = rankedy[i]
+                sumx = sumxx = sumyy = 0
 
-                    mean = (nobs + 1) / 2.
+                # Fastpath for data with no nans/infs, allows avoiding mask checks
+                # and array reassignments
+                if no_nans:
+                    mean = (N + 1) / 2.
 
                     # now the cov numerator
-                    sumx = sumxx = sumyy = 0
-
-                    for i in range(nobs):
-                        vx = maskedx[i] - mean
-                        vy = maskedy[i] - mean
+                    for i in range(N):
+                        vx = ranked_mat[i, xi] - mean
+                        vy = ranked_mat[i, yi] - mean
 
                         sumx += vx * vy
                         sumxx += vx * vx
                         sumyy += vy * vy
+                else:
+                    nobs = 0
+                    # Keep track of whether we need to recompute ranks
+                    all_ranks = True
+                    for i in range(N):
+                        all_ranks &= not (mask[i, xi] ^ mask[i, yi])
+                        if mask[i, xi] and mask[i, yi]:
+                            maskedx[nobs] = ranked_mat[i, xi]
+                            maskedy[nobs] = ranked_mat[i, yi]
+                            nobs += 1
+
+                    if nobs < minp:
+                        result[xi, yi] = result[yi, xi] = NaN
+                        continue
+                    else:
+                        if not all_ranks:
+                            with gil:
+                                # We need to slice back to nobs because rank_1d will
+                                # require arrays of nobs length
+                                labels_nobs = np.zeros(nobs, dtype=np.int64)
+                                rankedx = rank_1d(np.array(maskedx)[:nobs],
+                                                  labels=labels_nobs)
+                                rankedy = rank_1d(np.array(maskedy)[:nobs],
+                                                  labels=labels_nobs)
+                            for i in range(nobs):
+                                maskedx[i] = rankedx[i]
+                                maskedy[i] = rankedy[i]
+
+                        mean = (nobs + 1) / 2.
+
+                        # now the cov numerator
+                        for i in range(nobs):
+                            vx = maskedx[i] - mean
+                            vy = maskedy[i] - mean
 
-                    divisor = sqrt(sumxx * sumyy)
+                            sumx += vx * vy
+                            sumxx += vx * vx
+                            sumyy += vy * vy
 
-                    if divisor != 0:
-                        result[xi, yi] = result[yi, xi] = sumx / divisor
-                    else:
-                        result[xi, yi] = result[yi, xi] = NaN
+                divisor = sqrt(sumxx * sumyy)
+
+                if divisor != 0:
+                    result[xi, yi] = result[yi, xi] = sumx / divisor
+                else:
+                    result[xi, yi] = result[yi, xi] = NaN
 
     return result
 
diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py
index 3da3d82ec77f9..3a5e621a05530 100644
--- a/pandas/tests/frame/methods/test_cov_corr.py
+++ b/pandas/tests/frame/methods/test_cov_corr.py
@@ -232,6 +232,16 @@ def test_calc_corr_small_numbers(self):
         expected = DataFrame({"A": [1.0, 1.0], "B": [1.0, 1.0]}, index=["A", "B"])
         tm.assert_frame_equal(result, expected)
 
+    @td.skip_if_no_scipy
+    @pytest.mark.parametrize("method", ["pearson", "spearman", "kendall"])
+    def test_corr_min_periods_greater_than_length(self, method):
+        df = DataFrame({"A": [1, 2], "B": [1, 2]})
+        result = df.corr(method=method, min_periods=3)
+        expected = DataFrame(
+            {"A": [np.nan, np.nan], "B": [np.nan, np.nan]}, index=["A", "B"]
+        )
+        tm.assert_frame_equal(result, expected)
+
 
 class TestDataFrameCorrWith:
     def test_corrwith(self, datetime_frame):