[BACKPORT] Implements mt.stats.ks_1samp (#2335) (#2341)

mars-project · Aug 16, 2021 · 9c2da51 · 9c2da51
1 parent 8f44101
commit 9c2da51
Show file tree

Hide file tree

Showing 7 changed files with 228 additions and 40 deletions.
diff --git a/.github/workflows/cancel-prev.yml b/.github/workflows/cancel-prev.yml
@@ -6,7 +6,7 @@ jobs:
   cancel:
     runs-on: ubuntu-latest
     steps:
-      - uses: styfle/cancel-workflow-action@0.8.0
+      - uses: styfle/cancel-workflow-action@0.9.1
         with:
           access_token: ${{ github.token }}
           workflow_id: core-ci.yml,os-compat-ci.yml,platform-ci.yml,checks.yml
diff --git a/docs/source/reference/tensor/statistics.rst b/docs/source/reference/tensor/statistics.rst
@@ -62,6 +62,7 @@ Statistical tests
    :nosignatures:
 
    mars.tensor.stats.chisquare
+   mars.tensor.stats.ks_1samp
    mars.tensor.stats.ks_2samp
    mars.tensor.stats.power_divergence
    mars.tensor.stats.ttest_1samp

diff --git a/mars/tensor/stats/__init__.py b/mars/tensor/stats/__init__.py
@@ -14,6 +14,6 @@
 
 from .entropy import entropy
 from .chisquare import chisquare
+from .ks import ks_1samp, ks_2samp
 from .power_divergence import power_divergence
 from .ttest import ttest_1samp, ttest_ind, ttest_ind_from_stats, ttest_rel
-from .ks_2samp import ks_2samp
diff --git a/mars/tensor/stats/core.py b/mars/tensor/stats/core.py
diff --git a/mars/tensor/stats/ks_2samp.py → mars/tensor/stats/ks.py b/mars/tensor/stats/ks_2samp.py → mars/tensor/stats/ks.py
@@ -15,7 +15,8 @@
 import math
 import warnings
 from math import gcd
-from typing import Union
+from collections import namedtuple
+from typing import Callable, Tuple, Union
 
 import numpy as np
 from scipy import special
@@ -24,9 +25,9 @@
 from ... import tensor as mt
 from ...core import ExecutableTuple
 from ...typing import TileableType
-from .core import KstestResult
 
 
+KstestResult = namedtuple('KstestResult', ('statistic', 'pvalue'))
 Ks_2sampResult = KstestResult
 
 
@@ -300,7 +301,7 @@ def _attempt_exact_2kssamp(n1, n2, g, d, alternative):  # pragma: no cover
     return True, d, prob
 
 
-def _calc_prob(d, n1, n2, alternative, mode):  # pragma: no cover
+def _calc_prob_2samp(d, n1, n2, alternative, mode):  # pragma: no cover
     MAX_AUTO_N = 10000  # 'auto' will attempt to be exact if n1,n2 <= MAX_AUTO_N
 
     g = gcd(n1, n2)
@@ -327,7 +328,7 @@ def _calc_prob(d, n1, n2, alternative, mode):  # pragma: no cover
                               f"Switching to mode={mode}.", RuntimeWarning)
 
     if mode == 'asymp':
-        # The product n1*n2 is large.  Use Smirnov's asymptoptic formula.
+        # The product n1*n2 is large.  Use Smirnov's asymptotic formula.
         # Ensure float to avoid overflow in multiplication
         # sorted because the one-sided formula is not symmetric in n1, n2
         m, n = sorted([float(n1), float(n2)], reverse=True)
@@ -344,6 +345,188 @@ def _calc_prob(d, n1, n2, alternative, mode):  # pragma: no cover
     return np.clip(prob, 0, 1)
 
 
+def _compute_dplus(cdfvals, n):
+    """Computes D+ as used in the Kolmogorov-Smirnov test.
+    Parameters
+    ----------
+    cdfvals: array_like
+      Sorted array of CDF values between 0 and 1
+    Returns
+    -------
+      Maximum distance of the CDF values below Uniform(0, 1)
+    """
+    return (mt.arange(1.0, n + 1) / n - cdfvals).max()
+
+
+def _compute_dminus(cdfvals, n):
+    """Computes D- as used in the Kolmogorov-Smirnov test.
+    Parameters
+    ----------
+    cdfvals: array_like
+      Sorted array of CDF values between 0 and 1
+    Returns
+    -------
+      Maximum distance of the CDF values above Uniform(0, 1)
+    """
+    return (cdfvals - mt.arange(0.0, n) / n).max()
+
+
+def ks_1samp(x: Union[np.ndarray, list, TileableType],
+             cdf: Callable,
+             args: Tuple = (),
+             alternative: str = 'two-sided',
+             mode: str = 'auto'):
+    """
+    Performs the one-sample Kolmogorov-Smirnov test for goodness of fit.
+
+    This test compares the underlying distribution F(x) of a sample
+    against a given continuous distribution G(x). See Notes for a description
+    of the available null and alternative hypotheses.
+
+    Parameters
+    ----------
+    x : array_like
+        a 1-D array of observations of iid random variables.
+    cdf : callable
+        callable used to calculate the cdf.
+    args : tuple, sequence, optional
+        Distribution parameters, used with `cdf`.
+    alternative : {'two-sided', 'less', 'greater'}, optional
+        Defines the null and alternative hypotheses. Default is 'two-sided'.
+        Please see explanations in the Notes below.
+    mode : {'auto', 'exact', 'approx', 'asymp'}, optional
+        Defines the distribution used for calculating the p-value.
+        The following options are available (default is 'auto'):
+
+          * 'auto' : selects one of the other options.
+          * 'exact' : uses the exact distribution of test statistic.
+          * 'approx' : approximates the two-sided probability with twice
+            the one-sided probability
+          * 'asymp': uses asymptotic distribution of test statistic
+
+    Returns
+    -------
+    statistic : float
+        KS test statistic, either D, D+ or D- (depending on the value
+        of 'alternative')
+    pvalue :  float
+        One-tailed or two-tailed p-value.
+
+    See Also
+    --------
+    ks_2samp, kstest
+
+    Notes
+    -----
+    There are three options for the null and corresponding alternative
+    hypothesis that can be selected using the `alternative` parameter.
+
+    - `two-sided`: The null hypothesis is that the two distributions are
+      identical, F(x)=G(x) for all x; the alternative is that they are not
+      identical.
+
+    - `less`: The null hypothesis is that F(x) >= G(x) for all x; the
+      alternative is that F(x) < G(x) for at least one x.
+
+    - `greater`: The null hypothesis is that F(x) <= G(x) for all x; the
+      alternative is that F(x) > G(x) for at least one x.
+
+    Note that the alternative hypotheses describe the *CDFs* of the
+    underlying distributions, not the observed values. For example,
+    suppose x1 ~ F and x2 ~ G. If F(x) > G(x) for all x, the values in
+    x1 tend to be less than those in x2.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from scipy import stats
+    >>> import mars.tensor as mt
+    >>> from mars.tensor.stats import ks_1samp
+
+    >>> np.random.seed(12345678)  #fix random seed to get the same result
+    >>> x = mt.linspace(-15, 15, 9, chunk_size=5)
+    >>> ks_1samp(x, stats.norm.cdf).execute()
+    (0.44435602715924361, 0.038850142705171065)
+
+    >>> ks_1samp(stats.norm.rvs(size=100), stats.norm.cdf).execute()
+    KstestResult(statistic=0.165471391799..., pvalue=0.007331283245...)
+
+    *Test against one-sided alternative hypothesis*
+
+    Shift distribution to larger values, so that `` CDF(x) < norm.cdf(x)``:
+
+    >>> x = stats.norm.rvs(loc=0.2, size=100)
+    >>> ks_1samp(x, stats.norm.cdf, alternative='less').execute()
+    KstestResult(statistic=0.235488541678..., pvalue=1.158315030683...)
+
+    Reject null hypothesis in favor of alternative hypothesis: less
+
+    >>> ks_1samp(x, stats.norm.cdf, alternative='greater').execute()
+    KstestResult(statistic=0.010167165616..., pvalue=0.972494973653...)
+
+    Reject null hypothesis in favor of alternative hypothesis: greater
+
+    >>> ks_1samp(x, stats.norm.cdf).execute()
+    KstestResult(statistic=0.235488541678..., pvalue=2.316630061366...)
+
+    Don't reject null hypothesis in favor of alternative hypothesis: two-sided
+
+    *Testing t distributed random variables against normal distribution*
+
+    With 100 degrees of freedom the t distribution looks close to the normal
+    distribution, and the K-S test does not reject the hypothesis that the
+    sample came from the normal distribution:
+
+    >>> ks_1samp(stats.t.rvs(100, size=100), stats.norm.cdf).execute()
+    KstestResult(statistic=0.077844250253..., pvalue=0.553155412513...)
+
+    With 3 degrees of freedom the t distribution looks sufficiently different
+    from the normal distribution, that we can reject the hypothesis that the
+    sample came from the normal distribution at the 10% level:
+
+    >>> ks_1samp(stats.t.rvs(3, size=100), stats.norm.cdf).execute()
+    KstestResult(statistic=0.118967105356..., pvalue=0.108627114578...)
+    """
+    alternative = {'t': 'two-sided', 'g': 'greater', 'l': 'less'}.get(
+       alternative.lower()[0], alternative)
+    if alternative not in ['two-sided', 'greater', 'less']:
+        raise ValueError("Unexpected alternative %s" % alternative)
+
+    x = mt.asarray(x)
+    N = x.shape[0]
+    x = mt.sort(x)
+    cdfvals = x.map_chunk(cdf, args=args, elementwise=True)
+
+    if alternative == 'greater':
+        Dplus = _compute_dplus(cdfvals, N)
+        return ExecutableTuple(KstestResult(
+            Dplus, Dplus.map_chunk(distributions.ksone.sf, args=(N,))))
+
+    if alternative == 'less':
+        Dminus = _compute_dminus(cdfvals, N)
+        return ExecutableTuple(KstestResult(
+            Dminus, Dminus.map_chunk(distributions.ksone.sf, args=(N,))))
+
+    # alternative == 'two-sided':
+    Dplus = _compute_dplus(cdfvals, N)
+    Dminus = _compute_dminus(cdfvals, N)
+    D = mt.max([Dplus, Dminus])
+    if mode == 'auto':  # Always select exact
+        mode = 'exact'
+    if mode == 'exact':
+        prob = D.map_chunk(distributions.kstwo.sf, args=(N,),
+                           elementwise=True)
+    elif mode == 'asymp':
+        prob = (D * np.sqrt(N)).map_chunk(distributions.kstwobign.sf,
+                                          elementwise=True)
+    else:
+        # mode == 'approx'
+        prob = 2 * D.map_chunk(distributions.ksone.sf, args=(N,),
+                               elementwise=True)
+    prob = mt.clip(prob, 0, 1)
+    return ExecutableTuple(KstestResult(D, prob))
+
+
 def ks_2samp(data1: Union[np.ndarray, list, TileableType],
              data2: Union[np.ndarray, list, TileableType],
              alternative: str = 'two-sided',
@@ -477,7 +660,7 @@ def ks_2samp(data1: Union[np.ndarray, list, TileableType],
     maxS = mt.max(cddiffs)
     alt2Dvalue = {'less': minS, 'greater': maxS, 'two-sided': mt.maximum(minS, maxS)}
     d = alt2Dvalue[alternative]
-    prob = d.map_chunk(_calc_prob, args=(n1, n2, alternative, mode),
+    prob = d.map_chunk(_calc_prob_2samp, args=(n1, n2, alternative, mode),
                        elementwise=True, dtype=d.dtype)
 
     return ExecutableTuple(Ks_2sampResult(d, prob))
diff --git a/mars/tensor/stats/tests/test_stats_execute.py b/mars/tensor/stats/tests/test_stats_execute.py
@@ -22,20 +22,21 @@
     entropy as sp_entropy,
     power_divergence as sp_power_divergence,
     chisquare as sp_chisquare,
+    ks_1samp as sp_ks_1samp,
     ks_2samp as sp_ks_2samp,
+    norm as sp_norm,
     ttest_rel as sp_ttest_rel,
     ttest_ind as sp_ttest_ind,
     ttest_ind_from_stats as sp_ttest_ind_from_stats,
     ttest_1samp as sp_ttest_1samp,
-    norm as sp_norm
 )
 
 from mars.lib.version import parse as parse_version
 from mars.tensor import tensor
 from mars.tensor.stats import (
     entropy, power_divergence, chisquare,
     ttest_ind, ttest_rel, ttest_1samp, ttest_ind_from_stats,
-    ks_2samp
+    ks_1samp, ks_2samp,
 )
 
 
@@ -197,6 +198,32 @@ def test_t_test_execution(setup):
             np.testing.assert_almost_equal(expected[1], result[1])
 
 
+@pytest.mark.parametrize('chunk_size', [5, 15])
+@pytest.mark.parametrize('mode', ['auto', 'less', 'greater', 'asymp', 'approx'])
+def test_ks_1samp(setup, chunk_size, mode):
+    x = tensor(np.linspace(-15, 15, 9), chunk_size=5)
+
+    if mode == 'auto':
+        result = ks_1samp(x, sp_norm.cdf,
+                          alternative='greater').execute().fetch()
+        expected = sp_ks_1samp(x, sp_norm.cdf,
+                               alternative='greater')
+        assert result == expected
+
+        result = ks_1samp(x, sp_norm.cdf,
+                          alternative='less').execute().fetch()
+        expected = sp_ks_1samp(x, sp_norm.cdf,
+                               alternative='less')
+        assert result == expected
+
+    result = ks_1samp(x, sp_norm.cdf, mode=mode).execute().fetch()
+    expected = sp_ks_1samp(x, sp_norm.cdf, mode=mode)
+    assert result == expected
+
+    with pytest.raises(ValueError):
+        ks_1samp(x, sp_norm.cdf, alternative='unknown')
+
+
 @pytest.mark.parametrize('chunk_size', [5, 15])
 def test_ks_2samp(setup, chunk_size):
     n1 = 10

diff --git a/mars/tensor/stats/ttest.py b/mars/tensor/stats/ttest.py
@@ -15,19 +15,14 @@
 from collections import namedtuple
 
 import numpy as np
-try:
-    from scipy import __version__ as sp_version
-    from scipy.stats import (
-        ttest_ind as sp_ttest_ind,
-        ttest_ind_from_stats as sp_ttest_ind_from_stats,
-        ttest_rel as sp_ttest_rel,
-        ttest_1samp as sp_ttest_1samp,
-    )
-    from scipy.stats import distributions as sp_distributions
-except ImportError:
-    sp_version = None
-    sp_ttest_1samp = sp_ttest_ind = sp_ttest_ind_from_stats = sp_ttest_rel = None
-    sp_distributions = None
+from scipy import __version__ as sp_version
+from scipy.stats import (
+    ttest_ind as sp_ttest_ind,
+    ttest_ind_from_stats as sp_ttest_ind_from_stats,
+    ttest_rel as sp_ttest_rel,
+    ttest_1samp as sp_ttest_1samp,
+)
+from scipy.stats import distributions as sp_distributions
 
 from ...core import ExecutableTuple
 from ...lib.version import parse as parse_version