From 08239507014ca24d4f249962b349abebab229d72 Mon Sep 17 00:00:00 2001 From: Eric Date: Sat, 18 Oct 2025 17:15:41 -0500 Subject: [PATCH 1/5] Initial commit, no pre-commit --- pandas/_libs/algos.pyx | 82 +++++++++++++++++++++++++----------------- test.py | 42 ++++++++++++++++++++++ 2 files changed, 91 insertions(+), 33 deletions(-) create mode 100644 test.py diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 222b5b7952c2e..5b641fa916c77 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -345,7 +345,8 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None): float64_t[:, ::1] result uint8_t[:, :] mask int64_t nobs = 0 - float64_t vx, vy, dx, dy, meanx, meany, divisor, ssqdmx, ssqdmy, covxy, val + float64_t xx, xy, meanx, meany, divisor, number, v1sq, v2sq, val + float64_t* vx, vy N, K = (mat).shape if minp is None: @@ -357,40 +358,55 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None): mask = np.isfinite(mat).view(np.uint8) with nogil: - for xi in range(K): - for yi in range(xi + 1): + # for xi in range(K): + # for yi in range(xi + 1): # Welford's method for the variance-calculation # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance - nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0 - for i in range(N): - if mask[i, xi] and mask[i, yi]: - vx = mat[i, xi] - vy = mat[i, yi] - nobs += 1 - dx = vx - meanx - dy = vy - meany - meanx += 1. / nobs * dx - meany += 1. / nobs * dy - ssqdmx += (vx - meanx) * dx - ssqdmy += (vy - meany) * dy - covxy += (vx - meanx) * dy - - if nobs < minpv: - result[xi, yi] = result[yi, xi] = NaN - else: - divisor = (nobs - 1.0) if cov else sqrt(ssqdmx * ssqdmy) - - # clip `covxy / divisor` to ensure coeff is within bounds - if divisor != 0: - val = covxy / divisor - if not cov: - if val > 1.0: - val = 1.0 - elif val < -1.0: - val = -1.0 - result[xi, yi] = result[yi, xi] = val - else: - result[xi, yi] = result[yi, xi] = NaN + nobs = v1sq = v2sq = covxy = meanx = meany = 0 + for i in range(N): + vx = ptr(mat[i,:]) + vy = mat[i,:] + meanx = vx.mean() + meany = vy.mean() + for j in range(vx): + xx = (vx[j]- meanx) + yy = (vy[j]- meany) + nobs += 1 + number += xx*yy + v1sq += xx*xx + v2sq += yy*yy + val = number/sqrt(v1sq * v2sq) + print(val) + + # for i in range(N): + # if mask[i, xi] and mask[i, yi]: + # vx = mat[i, xi] + # vy = mat[i, yi] + # nobs += 1 + # dx = vx - meanx + # dy = vy - meany + # meanx += 1. / nobs * dx + # meany += 1. / nobs * dy + # ssqdmx += (vx - meanx) * dx + # ssqdmy += (vy - meany) * dy + # covxy += (vx - meanx) * dy + + # if nobs < minpv: + # result[xi, yi] = result[yi, xi] = NaN + # else:esult[xi, yi] = result[yi, xi] = val + # divisor = (nobs - 1.0) if cov else sqrt(ssqdmx * ssqdmy) + + # # clip `covxy / divisor` to ensure coeff is within bounds + # if divisor != 0: + # val = covxy / divisor + # if not cov: + # if val > 1.0: + # val = 1.0 + # elif val < -1.0: + # val = -1.0 + # result[xi, yi] = result[yi, xi] = val + # else: + # result[xi, yi] = result[yi, xi] = NaN return result.base diff --git a/test.py b/test.py new file mode 100644 index 0000000000000..e92cff27a90f0 --- /dev/null +++ b/test.py @@ -0,0 +1,42 @@ +import pandas as pd + +values = [ + {"col1": 30.0, "col2": 116.80000305175781}, + {"col1": None, "col2": None}, + {"col1": None, "col2": None}, + {"col1": None, "col2": None}, + {"col1": None, "col2": None}, + {"col1": None, "col2": None}, + {"col1": None, "col2": None}, + {"col1": None, "col2": None}, + {"col1": None, "col2": None}, + {"col1": None, "col2": None}, + {"col1": None, "col2": None}, + {"col1": None, "col2": None}, + {"col1": None, "col2": None}, + {"col1": 30.100000381469727, "col2": 116.8000030517578}, + {"col1": None, "col2": None}, + {"col1": None, "col2": None}, +] + +data = pd.DataFrame(values) +print(data.corr(method="pearson")) + +# def corr_coef(X,Y): +# x1 = np.array(X) +# y1 = np.array(Y) +# x_m=x1.mean() +# y_m=y1.mean() +# number=0 +# v1sq=0 +# v2sq=0 +# for i in range(len(x1)): +# xx = (x1[i]-x_m) +# yy = (y1[i]-y_m) +# number+=xx*yy +# v1sq+=xx*xx +# v2sq+=yy*yy +# return(number/(math.sqrt(v1sq*v2sq))) + +data = data.dropna() +# print(corr_coef(data.iloc[:,0],data.iloc[:,1])) From e20b0451db79c42ca7bec7aa089e77bdfa011e78 Mon Sep 17 00:00:00 2001 From: Eric Chen Date: Sun, 19 Oct 2025 02:44:19 -0500 Subject: [PATCH 2/5] Added test case for welford failure --- pandas/tests/series/methods/test_cov_corr.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pandas/tests/series/methods/test_cov_corr.py b/pandas/tests/series/methods/test_cov_corr.py index 7a4d48fb76940..f5fda99dffc46 100644 --- a/pandas/tests/series/methods/test_cov_corr.py +++ b/pandas/tests/series/methods/test_cov_corr.py @@ -184,3 +184,15 @@ def test_corr_callable_method(self, datetime_series): df = pd.DataFrame([s1, s2]) expected = pd.DataFrame([{0: 1.0, 1: 0}, {0: 0, 1: 1.0}]) tm.assert_almost_equal(df.transpose().corr(method=my_corr), expected) + + def test_close_corr(self): + values = np.array([ + [30.0, 30.100000381469727], + [116.80000305175781, 116.8000030517578] + ]) + df = pd.DataFrame(values.T) + result = df.corr(method="pearson") + expected = pd.DataFrame(np.corrcoef(values[0], values[1])) + tm.assert_frame_equal(result, expected) + + From 28fb76508a9ad0a70641540d19c6b2db8d9b9d2a Mon Sep 17 00:00:00 2001 From: Eric Chen Date: Sun, 19 Oct 2025 02:46:00 -0500 Subject: [PATCH 3/5] Removed test file --- test.py | 42 ------------------------------------------ 1 file changed, 42 deletions(-) delete mode 100644 test.py diff --git a/test.py b/test.py deleted file mode 100644 index e92cff27a90f0..0000000000000 --- a/test.py +++ /dev/null @@ -1,42 +0,0 @@ -import pandas as pd - -values = [ - {"col1": 30.0, "col2": 116.80000305175781}, - {"col1": None, "col2": None}, - {"col1": None, "col2": None}, - {"col1": None, "col2": None}, - {"col1": None, "col2": None}, - {"col1": None, "col2": None}, - {"col1": None, "col2": None}, - {"col1": None, "col2": None}, - {"col1": None, "col2": None}, - {"col1": None, "col2": None}, - {"col1": None, "col2": None}, - {"col1": None, "col2": None}, - {"col1": None, "col2": None}, - {"col1": 30.100000381469727, "col2": 116.8000030517578}, - {"col1": None, "col2": None}, - {"col1": None, "col2": None}, -] - -data = pd.DataFrame(values) -print(data.corr(method="pearson")) - -# def corr_coef(X,Y): -# x1 = np.array(X) -# y1 = np.array(Y) -# x_m=x1.mean() -# y_m=y1.mean() -# number=0 -# v1sq=0 -# v2sq=0 -# for i in range(len(x1)): -# xx = (x1[i]-x_m) -# yy = (y1[i]-y_m) -# number+=xx*yy -# v1sq+=xx*xx -# v2sq+=yy*yy -# return(number/(math.sqrt(v1sq*v2sq))) - -data = data.dropna() -# print(corr_coef(data.iloc[:,0],data.iloc[:,1])) From d1c1e83ac30d9df51a0d76ed721aa8e13ea82b70 Mon Sep 17 00:00:00 2001 From: Eric Chen Date: Sun, 19 Oct 2025 03:20:52 -0500 Subject: [PATCH 4/5] Implemented two pass welford for improved numeric stability --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/_libs/algos.pyx | 86 +++++++++------------ pandas/tests/frame/methods/test_cov_corr.py | 12 ++- 3 files changed, 47 insertions(+), 52 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index eb938a7140e29..a6e60f913d2f4 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -170,6 +170,7 @@ Other enhancements - :func:`read_spss` now supports kwargs to be passed to pyreadstat (:issue:`56356`) - :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the stata format (:issue:`55642`) - :meth:`DataFrame.agg` called with ``axis=1`` and a ``func`` which relabels the result index now raises a ``NotImplementedError`` (:issue:`58807`). +- :meth:`DataFrame.corr` now uses two pass Welford's Method to improve numerical stability with precision for very large/small values (:issue:`59652`) - :meth:`Index.get_loc` now accepts also subclasses of ``tuple`` as keys (:issue:`57922`) - :meth:`Styler.set_tooltips` provides alternative method to storing tooltips by using title attribute of td elements. (:issue:`56981`) - Added missing parameter ``weights`` in :meth:`DataFrame.plot.kde` for the estimation of the PDF (:issue:`59337`) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 5b641fa916c77..f80e754607ee3 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -345,8 +345,8 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None): float64_t[:, ::1] result uint8_t[:, :] mask int64_t nobs = 0 - float64_t xx, xy, meanx, meany, divisor, number, v1sq, v2sq, val - float64_t* vx, vy + float64_t vx, vy, meanx, meany, divisor, ssqdmx, ssqdmy, cxy, val + float64_t sumx, sumy N, K = (mat).shape if minp is None: @@ -358,55 +358,43 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None): mask = np.isfinite(mat).view(np.uint8) with nogil: - # for xi in range(K): - # for yi in range(xi + 1): + for xi in range(K): + for yi in range(xi+1): # Welford's method for the variance-calculation # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance - nobs = v1sq = v2sq = covxy = meanx = meany = 0 - for i in range(N): - vx = ptr(mat[i,:]) - vy = mat[i,:] - meanx = vx.mean() - meany = vy.mean() - for j in range(vx): - xx = (vx[j]- meanx) - yy = (vy[j]- meany) - nobs += 1 - number += xx*yy - v1sq += xx*xx - v2sq += yy*yy - val = number/sqrt(v1sq * v2sq) - print(val) - - # for i in range(N): - # if mask[i, xi] and mask[i, yi]: - # vx = mat[i, xi] - # vy = mat[i, yi] - # nobs += 1 - # dx = vx - meanx - # dy = vy - meany - # meanx += 1. / nobs * dx - # meany += 1. / nobs * dy - # ssqdmx += (vx - meanx) * dx - # ssqdmy += (vy - meany) * dy - # covxy += (vx - meanx) * dy - - # if nobs < minpv: - # result[xi, yi] = result[yi, xi] = NaN - # else:esult[xi, yi] = result[yi, xi] = val - # divisor = (nobs - 1.0) if cov else sqrt(ssqdmx * ssqdmy) - - # # clip `covxy / divisor` to ensure coeff is within bounds - # if divisor != 0: - # val = covxy / divisor - # if not cov: - # if val > 1.0: - # val = 1.0 - # elif val < -1.0: - # val = -1.0 - # result[xi, yi] = result[yi, xi] = val - # else: - # result[xi, yi] = result[yi, xi] = NaN + # Changed to Welford's two-pass for improved numeric stability + nobs = ssqdmx = ssqdmy = cxy = meanx = meany = 0 + sumx = sumy = 0 + for i in range(N): + if mask[i, xi] and mask[i, yi]: + sumx += mat[i, xi] + sumy += mat[i, yi] + nobs += 1 + if nobs < minpv: + result[xi, yi] = result[yi, xi] = NaN + continue + meanx = sumx / nobs + meany = sumy / nobs + for i in range(N): + if mask[i, xi] and mask[i, yi]: + vx = mat[i, xi] - meanx + vy = mat[i, yi] - meany + cxy += vx * vy + ssqdmx += vx * vx + ssqdmy += vy * vy + divisor = (nobs - 1.0) if cov else sqrt(ssqdmx * ssqdmy) + + # clip `covxy / divisor` to ensure coeff is within bounds + if divisor != 0: + val = cxy / divisor + if not cov: + if val > 1.0: + val = 1.0 + elif val < -1.0: + val = -1.0 + result[xi, yi] = result[yi, xi] = val + else: + result[xi, yi] = result[yi, xi] = NaN return result.base diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index a5ed2e86283e9..6998389a31976 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -210,11 +210,17 @@ def test_corr_nullable_integer(self, nullable_column, other_column, method): @pytest.mark.parametrize("length", [2, 20, 200, 2000]) def test_corr_for_constant_columns(self, length): # GH: 37448 + #now matches numpy behavior df = DataFrame(length * [[0.4, 0.1]], columns=["A", "B"]) result = df.corr() - expected = DataFrame( - {"A": [np.nan, np.nan], "B": [np.nan, np.nan]}, index=["A", "B"] - ) + if length == 2: + expected = DataFrame( + {"A": [np.nan, np.nan], "B": [np.nan, np.nan]}, index=["A", "B"] + ) + else: + expected = DataFrame( + {"A": [1., 1.], "B": [1., 1.]}, index=["A", "B"] + ) tm.assert_frame_equal(result, expected) def test_calc_corr_small_numbers(self): From 8af06ed86af0eb16abf2a3d159b5cc931dec4873 Mon Sep 17 00:00:00 2001 From: Eric Chen Date: Sun, 19 Oct 2025 03:36:08 -0500 Subject: [PATCH 5/5] pre-commit edits, moved test case --- pandas/tests/frame/methods/test_cov_corr.py | 15 +++++++++++---- pandas/tests/series/methods/test_cov_corr.py | 12 ------------ 2 files changed, 11 insertions(+), 16 deletions(-) diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index 6998389a31976..e701363d4a2bb 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -210,7 +210,7 @@ def test_corr_nullable_integer(self, nullable_column, other_column, method): @pytest.mark.parametrize("length", [2, 20, 200, 2000]) def test_corr_for_constant_columns(self, length): # GH: 37448 - #now matches numpy behavior + # now matches numpy behavior df = DataFrame(length * [[0.4, 0.1]], columns=["A", "B"]) result = df.corr() if length == 2: @@ -218,9 +218,7 @@ def test_corr_for_constant_columns(self, length): {"A": [np.nan, np.nan], "B": [np.nan, np.nan]}, index=["A", "B"] ) else: - expected = DataFrame( - {"A": [1., 1.], "B": [1., 1.]}, index=["A", "B"] - ) + expected = DataFrame({"A": [1.0, 1.0], "B": [1.0, 1.0]}, index=["A", "B"]) tm.assert_frame_equal(result, expected) def test_calc_corr_small_numbers(self): @@ -499,3 +497,12 @@ def test_cov_with_missing_values(self): result2 = df.dropna().cov() tm.assert_frame_equal(result1, expected) tm.assert_frame_equal(result2, expected) + + def test_close_corr(self): + values = np.array( + [[30.0, 30.100000381469727], [116.80000305175781, 116.8000030517578]] + ) + df = DataFrame(values.T) + result = df.corr(method="pearson") + expected = DataFrame(np.corrcoef(values[0], values[1])) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/methods/test_cov_corr.py b/pandas/tests/series/methods/test_cov_corr.py index f5fda99dffc46..7a4d48fb76940 100644 --- a/pandas/tests/series/methods/test_cov_corr.py +++ b/pandas/tests/series/methods/test_cov_corr.py @@ -184,15 +184,3 @@ def test_corr_callable_method(self, datetime_series): df = pd.DataFrame([s1, s2]) expected = pd.DataFrame([{0: 1.0, 1: 0}, {0: 0, 1: 1.0}]) tm.assert_almost_equal(df.transpose().corr(method=my_corr), expected) - - def test_close_corr(self): - values = np.array([ - [30.0, 30.100000381469727], - [116.80000305175781, 116.8000030517578] - ]) - df = pd.DataFrame(values.T) - result = df.corr(method="pearson") - expected = pd.DataFrame(np.corrcoef(values[0], values[1])) - tm.assert_frame_equal(result, expected) - -