From c75bc10f58b353fa4cc2f91b7d182d59481f0866 Mon Sep 17 00:00:00 2001 From: Shadi Akiki Date: Thu, 13 Sep 2018 08:31:49 +0000 Subject: [PATCH] ENH: correlation function accepts method being a callable - other than the listed strings for the `method` argument, accept a callable for generic correlation calculations - minor fix of = to == in requirements file --- doc/source/computation.rst | 15 +++++++++++++ doc/source/whatsnew/v0.24.0.txt | 3 +++ pandas/core/frame.py | 8 +++++-- pandas/core/nanops.py | 3 +++ pandas/core/series.py | 8 +++++-- pandas/tests/series/test_analytics.py | 32 +++++++++++++++++++++++++++ 6 files changed, 65 insertions(+), 4 deletions(-) diff --git a/doc/source/computation.rst b/doc/source/computation.rst index 5e7b8be5f8af04..0d2021de8f88e0 100644 --- a/doc/source/computation.rst +++ b/doc/source/computation.rst @@ -153,6 +153,21 @@ Like ``cov``, ``corr`` also supports the optional ``min_periods`` keyword: frame.corr(min_periods=12) +.. versionadded:: 0.24.0 + +The ``method`` argument can also be a callable for a generic correlation +calculation. In this case, it should be a single function +that produces a single value from two ndarray inputs. Suppose we wanted to +compute the correlation based on histogram intersection: + +.. ipython:: python + + # histogram intersection + histogram_intersection = lambda a, b: np.minimum( + np.true_divide(a, a.sum()), np.true_divide(b, b.sum()) + ).sum() + frame.corr(method=histogram_intersection) + A related method :meth:`~DataFrame.corrwith` is implemented on DataFrame to compute the correlation between like-labeled Series contained in different DataFrame objects. 
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 649629714c3b1f..45fb2f8f13931b 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -17,6 +17,9 @@ New features - ``ExcelWriter`` now accepts ``mode`` as a keyword argument, enabling append to existing workbooks when using the ``openpyxl`` engine (:issue:`3441`) +- :meth:`DataFrame.corr` and :meth:`Series.corr` now accept a callable for generic calculation methods of correlation, e.g. histogram intersection (:issue:`22684`) + + .. _whatsnew_0240.enhancements.extension_array_operators: ``ExtensionArray`` operator support diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 251bc6587872d6..b3a62f7a21b08e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6652,10 +6652,14 @@ def corr(self, method='pearson', min_periods=1): Parameters ---------- - method : {'pearson', 'kendall', 'spearman'} + method : {'pearson', 'kendall', 'spearman'} or callable * pearson : standard correlation coefficient * kendall : Kendall Tau correlation coefficient * spearman : Spearman rank correlation + * callable: callable with input two 1d ndarrays + and returning a float + .. versionadded:: 0.24.0 + min_periods : int, optional Minimum number of observations required per pair of columns to have a valid result. 
Currently only available for pearson @@ -6675,7 +6679,7 @@ def corr(self, method='pearson', min_periods=1): elif method == 'spearman': correl = libalgos.nancorr_spearman(ensure_float64(mat), minp=min_periods) - elif method == 'kendall': + elif method == 'kendall' or callable(method): if min_periods is None: min_periods = 1 mat = ensure_float64(mat).T diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index f44fb4f6e9e144..278bd7544e343f 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -764,6 +764,9 @@ def nancorr(a, b, method='pearson', min_periods=None): def get_corr_func(method): + if callable(method): + return method + if method in ['kendall', 'spearman']: from scipy.stats import kendalltau, spearmanr diff --git a/pandas/core/series.py b/pandas/core/series.py index a4d403e4bcd948..0e616b130b6082 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1913,10 +1913,14 @@ def corr(self, other, method='pearson', min_periods=None): Parameters ---------- other : Series - method : {'pearson', 'kendall', 'spearman'} + method : {'pearson', 'kendall', 'spearman'} or callable * pearson : standard correlation coefficient * kendall : Kendall Tau correlation coefficient * spearman : Spearman rank correlation + * callable: callable with input two 1d ndarrays + and returning a float + .. 
versionadded:: 0.24.0 + min_periods : int, optional Minimum number of observations needed to have a valid result @@ -1929,7 +1933,7 @@ def corr(self, other, method='pearson', min_periods=None): if len(this) == 0: return np.nan - if method in ['pearson', 'spearman', 'kendall']: + if method in ['pearson', 'spearman', 'kendall'] or callable(method): return nanops.nancorr(this.values, other.values, method=method, min_periods=min_periods) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index d5d9e5f4f14de2..1bdb96cf5cc564 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -789,6 +789,38 @@ def test_corr_invalid_method(self): with tm.assert_raises_regex(ValueError, msg): s1.corr(s2, method="____") + def test_corr_callable_method(self): + # simple correlation example + # returns 1 if exact equality, 0 otherwise + my_corr = lambda a, b: 1. if (a == b).all() else 0. + + # simple example + s1 = Series([1, 2, 3, 4, 5]) + s2 = Series([5, 4, 3, 2, 1]) + expected_1 = 0 + tm.assert_almost_equal( + s1.corr(s2, method=my_corr), + expected_1) + + # full overlap + tm.assert_almost_equal( + self.ts.corr(self.ts, method=my_corr), 1.) + + # partial overlap + tm.assert_almost_equal( + self.ts[:15].corr(self.ts[5:], method=my_corr), 1.) + + # No overlap + assert np.isnan( + self.ts[::2].corr(self.ts[1::2], method=my_corr)) + + # dataframe example + df = pd.DataFrame([s1, s2]) + expected_2 = pd.DataFrame([ + {0: 1., 1: expected_1}, {0: expected_1, 1: 1.}]) + tm.assert_almost_equal( + df.transpose().corr(method=my_corr), expected_2) + def test_cov(self): # full overlap tm.assert_almost_equal(self.ts.cov(self.ts), self.ts.std() ** 2)