Skip to content

Commit

Permalink
Backport PR #45646: Revert "PERF: nancorr pearson (#42761)" (#45649)
Browse files Browse the repository at this point in the history
Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
  • Loading branch information
meeseeksmachine and phofl committed Jan 27, 2022
1 parent b241701 commit 2fcb0cd
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 50 deletions.
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.4.1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ Bug fixes

Other
~~~~~
-
- Reverted performance speedup of :meth:`DataFrame.corr` for ``method=pearson`` to fix precision regression (:issue:`45640`, :issue:`42761`)
-

.. ---------------------------------------------------------------------------
Expand Down
62 changes: 13 additions & 49 deletions pandas/_libs/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -329,12 +329,8 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None):
Py_ssize_t i, j, xi, yi, N, K
bint minpv
float64_t[:, ::1] result
# Initialize to None since we only use in the no missing value case
float64_t[::1] means=None, ssqds=None
ndarray[uint8_t, ndim=2] mask
bint no_nans
int64_t nobs = 0
float64_t mean, ssqd, val
float64_t vx, vy, dx, dy, meanx, meany, divisor, ssqdmx, ssqdmy, covxy

N, K = (<object>mat).shape
Expand All @@ -346,57 +342,25 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None):

result = np.empty((K, K), dtype=np.float64)
mask = np.isfinite(mat).view(np.uint8)
no_nans = mask.all()

# Computing the online means and variances is expensive - so if possible we can
# precompute these and avoid repeating the computations each time we handle
# an (xi, yi) pair
if no_nans:
means = np.empty(K, dtype=np.float64)
ssqds = np.empty(K, dtype=np.float64)

with nogil:
for j in range(K):
ssqd = mean = 0
for i in range(N):
val = mat[i, j]
dx = val - mean
mean += 1 / (i + 1) * dx
ssqd += (val - mean) * dx

means[j] = mean
ssqds[j] = ssqd

with nogil:
for xi in range(K):
for yi in range(xi + 1):
covxy = 0
if no_nans:
for i in range(N):
# Welford's method for the variance-calculation
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0
for i in range(N):
if mask[i, xi] and mask[i, yi]:
vx = mat[i, xi]
vy = mat[i, yi]
covxy += (vx - means[xi]) * (vy - means[yi])

ssqdmx = ssqds[xi]
ssqdmy = ssqds[yi]
nobs = N

else:
nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0
for i in range(N):
# Welford's method for the variance-calculation
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
if mask[i, xi] and mask[i, yi]:
vx = mat[i, xi]
vy = mat[i, yi]
nobs += 1
dx = vx - meanx
dy = vy - meany
meanx += 1 / nobs * dx
meany += 1 / nobs * dy
ssqdmx += (vx - meanx) * dx
ssqdmy += (vy - meany) * dy
covxy += (vx - meanx) * dy
nobs += 1
dx = vx - meanx
dy = vy - meany
meanx += 1 / nobs * dx
meany += 1 / nobs * dy
ssqdmx += (vx - meanx) * dx
ssqdmy += (vy - meany) * dy
covxy += (vx - meanx) * dy

if nobs < minpv:
result[xi, yi] = result[yi, xi] = NaN
Expand Down
7 changes: 7 additions & 0 deletions pandas/tests/frame/methods/test_cov_corr.py
Original file line number Diff line number Diff line change
Expand Up @@ -337,6 +337,13 @@ def test_corrwith_dup_cols(self):
expected = Series(np.ones(4), index=[0, 0, 1, 2])
tm.assert_series_equal(result, expected)

def test_corr_numerical_instabilities(self):
# GH#45640
# Regression test for the precision issue in DataFrame.corr() that led to
# the pearson speedup being reverted.
# The two columns are exact mirrors of each other ([0.2, 0.4] vs [0.4, 0.2]),
# so the expected correlation matrix is exactly 1.0 on the diagonal and
# -1.0 off the diagonal.
df = DataFrame([[0.2, 0.4], [0.4, 0.2]])
result = df.corr()
expected = DataFrame({0: [1.0, -1.0], 1: [-1.0, 1.0]})
# Both frames are shifted by -1 before comparing, which moves the diagonal
# entries to 0.
# NOTE(review): presumably this is so that only the very tight absolute
# tolerance (atol=1e-17) applies to those entries rather than a relative
# tolerance -- confirm against assert_frame_equal's rtol default.
tm.assert_frame_equal(result - 1, expected - 1, atol=1e-17)

@td.skip_if_no_scipy
def test_corrwith_spearman(self):
# GH#21925
Expand Down

0 comments on commit 2fcb0cd

Please sign in to comment.