Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PERF: nancorr_spearman fastpath #41885

Merged
merged 4 commits into from
Jun 9, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -859,7 +859,7 @@ Performance improvements
- Performance improvement in :meth:`Series.isin` for nullable data types (:issue:`38340`)
- Performance improvement in :meth:`DataFrame.fillna` with ``method="pad|backfill"`` for nullable floating and nullable integer dtypes (:issue:`39953`)
- Performance improvement in :meth:`DataFrame.corr` for ``method=kendall`` (:issue:`28329`)
- Performance improvement in :meth:`DataFrame.corr` for ``method=spearman`` (:issue:`40956`)
- Performance improvement in :meth:`DataFrame.corr` for ``method=spearman`` (:issue:`40956`, :issue:`41885`)
- Performance improvement in :meth:`.Rolling.corr` and :meth:`.Rolling.cov` (:issue:`39388`)
- Performance improvement in :meth:`.RollingGroupby.corr`, :meth:`.RollingGroupby.cov`, :meth:`.ExpandingGroupby.corr` and :meth:`.ExpandingGroupby.cov` (:issue:`39591`)
- Performance improvement in :func:`unique` for object data type (:issue:`37615`)
Expand Down
95 changes: 59 additions & 36 deletions pandas/_libs/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -387,15 +387,23 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr
float64_t[::1] maskedx, maskedy
ndarray[uint8_t, ndim=2] mask
int64_t nobs = 0
bint no_nans
float64_t vx, vy, sumx, sumxx, sumyy, mean, divisor
const int64_t[:] labels_n, labels_nobs

N, K = (<object>mat).shape
# For compatibility when calling rank_1d
labels_n = np.zeros(N, dtype=np.int64)

# Handle the edge case where we know all results will be nan
# to keep conditional logic inside loop simpler
if N < minp:
result = np.full((K, K), np.nan, dtype=np.float64)
return result

result = np.empty((K, K), dtype=np.float64)
mask = np.isfinite(mat).view(np.uint8)
no_nans = mask.all()

ranked_mat = np.empty((N, K), dtype=np.float64)

Expand All @@ -409,51 +417,66 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr
with nogil:
for xi in range(K):
for yi in range(xi + 1):
nobs = 0
# Keep track of whether we need to recompute ranks
all_ranks = True
for i in range(N):
all_ranks &= not (mask[i, xi] ^ mask[i, yi])
if mask[i, xi] and mask[i, yi]:
maskedx[nobs] = ranked_mat[i, xi]
maskedy[nobs] = ranked_mat[i, yi]
nobs += 1

if nobs < minp:
result[xi, yi] = result[yi, xi] = NaN
else:
if not all_ranks:
with gil:
# We need to slice back to nobs because rank_1d will
# require arrays of nobs length
labels_nobs = np.zeros(nobs, dtype=np.int64)
rankedx = rank_1d(np.array(maskedx)[:nobs],
labels=labels_nobs)
rankedy = rank_1d(np.array(maskedy)[:nobs],
labels=labels_nobs)
for i in range(nobs):
maskedx[i] = rankedx[i]
maskedy[i] = rankedy[i]
sumx = sumxx = sumyy = 0

mean = (nobs + 1) / 2.
# Fastpath for data with no nans/infs, allows avoiding mask checks
# and array reassignments
if no_nans:
mean = (N + 1) / 2.

# now the cov numerator
sumx = sumxx = sumyy = 0

for i in range(nobs):
vx = maskedx[i] - mean
vy = maskedy[i] - mean
for i in range(N):
vx = ranked_mat[i, xi] - mean
vy = ranked_mat[i, yi] - mean

sumx += vx * vy
sumxx += vx * vx
sumyy += vy * vy
else:
nobs = 0
# Keep track of whether we need to recompute ranks
all_ranks = True
for i in range(N):
all_ranks &= not (mask[i, xi] ^ mask[i, yi])
if mask[i, xi] and mask[i, yi]:
maskedx[nobs] = ranked_mat[i, xi]
maskedy[nobs] = ranked_mat[i, yi]
nobs += 1

if nobs < minp:
result[xi, yi] = result[yi, xi] = NaN
continue
else:
if not all_ranks:
with gil:
# We need to slice back to nobs because rank_1d will
# require arrays of nobs length
labels_nobs = np.zeros(nobs, dtype=np.int64)
rankedx = rank_1d(np.array(maskedx)[:nobs],
labels=labels_nobs)
rankedy = rank_1d(np.array(maskedy)[:nobs],
labels=labels_nobs)
for i in range(nobs):
maskedx[i] = rankedx[i]
maskedy[i] = rankedy[i]

mean = (nobs + 1) / 2.

# now the cov numerator
for i in range(nobs):
vx = maskedx[i] - mean
vy = maskedy[i] - mean

divisor = sqrt(sumxx * sumyy)
sumx += vx * vy
sumxx += vx * vx
sumyy += vy * vy

if divisor != 0:
result[xi, yi] = result[yi, xi] = sumx / divisor
else:
result[xi, yi] = result[yi, xi] = NaN
divisor = sqrt(sumxx * sumyy)

if divisor != 0:
result[xi, yi] = result[yi, xi] = sumx / divisor
else:
result[xi, yi] = result[yi, xi] = NaN

return result

Expand Down
10 changes: 10 additions & 0 deletions pandas/tests/frame/methods/test_cov_corr.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,16 @@ def test_calc_corr_small_numbers(self):
expected = DataFrame({"A": [1.0, 1.0], "B": [1.0, 1.0]}, index=["A", "B"])
tm.assert_frame_equal(result, expected)

@td.skip_if_no_scipy
@pytest.mark.parametrize("method", ["pearson", "spearman", "kendall"])
def test_corr_min_periods_greater_than_length(self, method):
    # min_periods (3) is larger than the number of rows (2), so every
    # pairwise correlation is expected to be NaN for each method.
    labels = ["A", "B"]
    frame = DataFrame({label: [1, 2] for label in labels})
    expected = DataFrame(
        {label: [np.nan, np.nan] for label in labels}, index=labels
    )

    result = frame.corr(method=method, min_periods=3)

    tm.assert_frame_equal(result, expected)


class TestDataFrameCorrWith:
def test_corrwith(self, datetime_frame):
Expand Down