Skip to content

Commit

Permalink
[BACKPORT] Implements mt.stats.ks_1samp (#2335) (#2341)
Browse files Browse the repository at this point in the history
  • Loading branch information
wjsi committed Aug 16, 2021
1 parent 8f44101 commit 9c2da51
Show file tree
Hide file tree
Showing 7 changed files with 228 additions and 40 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/cancel-prev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ jobs:
cancel:
runs-on: ubuntu-latest
steps:
- uses: styfle/cancel-workflow-action@0.8.0
- uses: styfle/cancel-workflow-action@0.9.1
with:
access_token: ${{ github.token }}
workflow_id: core-ci.yml,os-compat-ci.yml,platform-ci.yml,checks.yml
1 change: 1 addition & 0 deletions docs/source/reference/tensor/statistics.rst
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ Statistical tests
:nosignatures:

mars.tensor.stats.chisquare
mars.tensor.stats.ks_1samp
mars.tensor.stats.ks_2samp
mars.tensor.stats.power_divergence
mars.tensor.stats.ttest_1samp
Expand Down
2 changes: 1 addition & 1 deletion mars/tensor/stats/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,6 @@

from .entropy import entropy
from .chisquare import chisquare
from .ks import ks_1samp, ks_2samp
from .power_divergence import power_divergence
from .ttest import ttest_1samp, ttest_ind, ttest_ind_from_stats, ttest_rel
from .ks_2samp import ks_2samp
18 changes: 0 additions & 18 deletions mars/tensor/stats/core.py

This file was deleted.

193 changes: 188 additions & 5 deletions mars/tensor/stats/ks_2samp.py → mars/tensor/stats/ks.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
import math
import warnings
from math import gcd
from typing import Union
from collections import namedtuple
from typing import Callable, Tuple, Union

import numpy as np
from scipy import special
Expand All @@ -24,9 +25,9 @@
from ... import tensor as mt
from ...core import ExecutableTuple
from ...typing import TileableType
from .core import KstestResult


KstestResult = namedtuple('KstestResult', ('statistic', 'pvalue'))
Ks_2sampResult = KstestResult


Expand Down Expand Up @@ -300,7 +301,7 @@ def _attempt_exact_2kssamp(n1, n2, g, d, alternative): # pragma: no cover
return True, d, prob


def _calc_prob(d, n1, n2, alternative, mode): # pragma: no cover
def _calc_prob_2samp(d, n1, n2, alternative, mode): # pragma: no cover
MAX_AUTO_N = 10000 # 'auto' will attempt to be exact if n1,n2 <= MAX_AUTO_N

g = gcd(n1, n2)
Expand All @@ -327,7 +328,7 @@ def _calc_prob(d, n1, n2, alternative, mode): # pragma: no cover
f"Switching to mode={mode}.", RuntimeWarning)

if mode == 'asymp':
# The product n1*n2 is large. Use Smirnov's asymptoptic formula.
# The product n1*n2 is large. Use Smirnov's asymptotic formula.
# Ensure float to avoid overflow in multiplication
# sorted because the one-sided formula is not symmetric in n1, n2
m, n = sorted([float(n1), float(n2)], reverse=True)
Expand All @@ -344,6 +345,188 @@ def _calc_prob(d, n1, n2, alternative, mode): # pragma: no cover
return np.clip(prob, 0, 1)


def _compute_dplus(cdfvals, n):
"""Computes D+ as used in the Kolmogorov-Smirnov test.
Parameters
----------
cdfvals: array_like
Sorted array of CDF values between 0 and 1
Returns
-------
Maximum distance of the CDF values below Uniform(0, 1)
"""
return (mt.arange(1.0, n + 1) / n - cdfvals).max()


def _compute_dminus(cdfvals, n):
"""Computes D- as used in the Kolmogorov-Smirnov test.
Parameters
----------
cdfvals: array_like
Sorted array of CDF values between 0 and 1
Returns
-------
Maximum distance of the CDF values above Uniform(0, 1)
"""
return (cdfvals - mt.arange(0.0, n) / n).max()


def ks_1samp(x: Union[np.ndarray, list, TileableType],
cdf: Callable,
args: Tuple = (),
alternative: str = 'two-sided',
mode: str = 'auto'):
"""
Performs the one-sample Kolmogorov-Smirnov test for goodness of fit.
This test compares the underlying distribution F(x) of a sample
against a given continuous distribution G(x). See Notes for a description
of the available null and alternative hypotheses.
Parameters
----------
x : array_like
a 1-D array of observations of iid random variables.
cdf : callable
callable used to calculate the cdf.
args : tuple, sequence, optional
Distribution parameters, used with `cdf`.
alternative : {'two-sided', 'less', 'greater'}, optional
Defines the null and alternative hypotheses. Default is 'two-sided'.
Please see explanations in the Notes below.
mode : {'auto', 'exact', 'approx', 'asymp'}, optional
Defines the distribution used for calculating the p-value.
The following options are available (default is 'auto'):
* 'auto' : selects one of the other options.
* 'exact' : uses the exact distribution of test statistic.
* 'approx' : approximates the two-sided probability with twice
the one-sided probability
* 'asymp': uses asymptotic distribution of test statistic
Returns
-------
statistic : float
KS test statistic, either D, D+ or D- (depending on the value
of 'alternative')
pvalue : float
One-tailed or two-tailed p-value.
See Also
--------
ks_2samp, kstest
Notes
-----
There are three options for the null and corresponding alternative
hypothesis that can be selected using the `alternative` parameter.
- `two-sided`: The null hypothesis is that the two distributions are
identical, F(x)=G(x) for all x; the alternative is that they are not
identical.
- `less`: The null hypothesis is that F(x) >= G(x) for all x; the
alternative is that F(x) < G(x) for at least one x.
- `greater`: The null hypothesis is that F(x) <= G(x) for all x; the
alternative is that F(x) > G(x) for at least one x.
Note that the alternative hypotheses describe the *CDFs* of the
underlying distributions, not the observed values. For example,
suppose x1 ~ F and x2 ~ G. If F(x) > G(x) for all x, the values in
x1 tend to be less than those in x2.
Examples
--------
>>> import numpy as np
>>> from scipy import stats
>>> import mars.tensor as mt
>>> from mars.tensor.stats import ks_1samp
>>> np.random.seed(12345678) #fix random seed to get the same result
>>> x = mt.linspace(-15, 15, 9, chunk_size=5)
>>> ks_1samp(x, stats.norm.cdf).execute()
(0.44435602715924361, 0.038850142705171065)
>>> ks_1samp(stats.norm.rvs(size=100), stats.norm.cdf).execute()
KstestResult(statistic=0.165471391799..., pvalue=0.007331283245...)
*Test against one-sided alternative hypothesis*
Shift distribution to larger values, so that `` CDF(x) < norm.cdf(x)``:
>>> x = stats.norm.rvs(loc=0.2, size=100)
>>> ks_1samp(x, stats.norm.cdf, alternative='less').execute()
KstestResult(statistic=0.235488541678..., pvalue=1.158315030683...)
Reject null hypothesis in favor of alternative hypothesis: less
>>> ks_1samp(x, stats.norm.cdf, alternative='greater').execute()
KstestResult(statistic=0.010167165616..., pvalue=0.972494973653...)
Reject null hypothesis in favor of alternative hypothesis: greater
>>> ks_1samp(x, stats.norm.cdf).execute()
KstestResult(statistic=0.235488541678..., pvalue=2.316630061366...)
Don't reject null hypothesis in favor of alternative hypothesis: two-sided
*Testing t distributed random variables against normal distribution*
With 100 degrees of freedom the t distribution looks close to the normal
distribution, and the K-S test does not reject the hypothesis that the
sample came from the normal distribution:
>>> ks_1samp(stats.t.rvs(100, size=100), stats.norm.cdf).execute()
KstestResult(statistic=0.077844250253..., pvalue=0.553155412513...)
With 3 degrees of freedom the t distribution looks sufficiently different
from the normal distribution, that we can reject the hypothesis that the
sample came from the normal distribution at the 10% level:
>>> ks_1samp(stats.t.rvs(3, size=100), stats.norm.cdf).execute()
KstestResult(statistic=0.118967105356..., pvalue=0.108627114578...)
"""
alternative = {'t': 'two-sided', 'g': 'greater', 'l': 'less'}.get(
alternative.lower()[0], alternative)
if alternative not in ['two-sided', 'greater', 'less']:
raise ValueError("Unexpected alternative %s" % alternative)

x = mt.asarray(x)
N = x.shape[0]
x = mt.sort(x)
cdfvals = x.map_chunk(cdf, args=args, elementwise=True)

if alternative == 'greater':
Dplus = _compute_dplus(cdfvals, N)
return ExecutableTuple(KstestResult(
Dplus, Dplus.map_chunk(distributions.ksone.sf, args=(N,))))

if alternative == 'less':
Dminus = _compute_dminus(cdfvals, N)
return ExecutableTuple(KstestResult(
Dminus, Dminus.map_chunk(distributions.ksone.sf, args=(N,))))

# alternative == 'two-sided':
Dplus = _compute_dplus(cdfvals, N)
Dminus = _compute_dminus(cdfvals, N)
D = mt.max([Dplus, Dminus])
if mode == 'auto': # Always select exact
mode = 'exact'
if mode == 'exact':
prob = D.map_chunk(distributions.kstwo.sf, args=(N,),
elementwise=True)
elif mode == 'asymp':
prob = (D * np.sqrt(N)).map_chunk(distributions.kstwobign.sf,
elementwise=True)
else:
# mode == 'approx'
prob = 2 * D.map_chunk(distributions.ksone.sf, args=(N,),
elementwise=True)
prob = mt.clip(prob, 0, 1)
return ExecutableTuple(KstestResult(D, prob))


def ks_2samp(data1: Union[np.ndarray, list, TileableType],
data2: Union[np.ndarray, list, TileableType],
alternative: str = 'two-sided',
Expand Down Expand Up @@ -477,7 +660,7 @@ def ks_2samp(data1: Union[np.ndarray, list, TileableType],
maxS = mt.max(cddiffs)
alt2Dvalue = {'less': minS, 'greater': maxS, 'two-sided': mt.maximum(minS, maxS)}
d = alt2Dvalue[alternative]
prob = d.map_chunk(_calc_prob, args=(n1, n2, alternative, mode),
prob = d.map_chunk(_calc_prob_2samp, args=(n1, n2, alternative, mode),
elementwise=True, dtype=d.dtype)

return ExecutableTuple(Ks_2sampResult(d, prob))
31 changes: 29 additions & 2 deletions mars/tensor/stats/tests/test_stats_execute.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,20 +22,21 @@
entropy as sp_entropy,
power_divergence as sp_power_divergence,
chisquare as sp_chisquare,
ks_1samp as sp_ks_1samp,
ks_2samp as sp_ks_2samp,
norm as sp_norm,
ttest_rel as sp_ttest_rel,
ttest_ind as sp_ttest_ind,
ttest_ind_from_stats as sp_ttest_ind_from_stats,
ttest_1samp as sp_ttest_1samp,
norm as sp_norm
)

from mars.lib.version import parse as parse_version
from mars.tensor import tensor
from mars.tensor.stats import (
entropy, power_divergence, chisquare,
ttest_ind, ttest_rel, ttest_1samp, ttest_ind_from_stats,
ks_2samp
ks_1samp, ks_2samp,
)


Expand Down Expand Up @@ -197,6 +198,32 @@ def test_t_test_execution(setup):
np.testing.assert_almost_equal(expected[1], result[1])


@pytest.mark.parametrize('chunk_size', [5, 15])
@pytest.mark.parametrize('mode', ['auto', 'less', 'greater', 'asymp', 'approx'])
def test_ks_1samp(setup, chunk_size, mode):
x = tensor(np.linspace(-15, 15, 9), chunk_size=5)

if mode == 'auto':
result = ks_1samp(x, sp_norm.cdf,
alternative='greater').execute().fetch()
expected = sp_ks_1samp(x, sp_norm.cdf,
alternative='greater')
assert result == expected

result = ks_1samp(x, sp_norm.cdf,
alternative='less').execute().fetch()
expected = sp_ks_1samp(x, sp_norm.cdf,
alternative='less')
assert result == expected

result = ks_1samp(x, sp_norm.cdf, mode=mode).execute().fetch()
expected = sp_ks_1samp(x, sp_norm.cdf, mode=mode)
assert result == expected

with pytest.raises(ValueError):
ks_1samp(x, sp_norm.cdf, alternative='unknown')


@pytest.mark.parametrize('chunk_size', [5, 15])
def test_ks_2samp(setup, chunk_size):
n1 = 10
Expand Down
21 changes: 8 additions & 13 deletions mars/tensor/stats/ttest.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,19 +15,14 @@
from collections import namedtuple

import numpy as np
try:
from scipy import __version__ as sp_version
from scipy.stats import (
ttest_ind as sp_ttest_ind,
ttest_ind_from_stats as sp_ttest_ind_from_stats,
ttest_rel as sp_ttest_rel,
ttest_1samp as sp_ttest_1samp,
)
from scipy.stats import distributions as sp_distributions
except ImportError:
sp_version = None
sp_ttest_1samp = sp_ttest_ind = sp_ttest_ind_from_stats = sp_ttest_rel = None
sp_distributions = None
from scipy import __version__ as sp_version
from scipy.stats import (
ttest_ind as sp_ttest_ind,
ttest_ind_from_stats as sp_ttest_ind_from_stats,
ttest_rel as sp_ttest_rel,
ttest_1samp as sp_ttest_1samp,
)
from scipy.stats import distributions as sp_distributions

from ...core import ExecutableTuple
from ...lib.version import parse as parse_version
Expand Down

0 comments on commit 9c2da51

Please sign in to comment.