[BACKPORT] Implements mt.stats.ks_1samp (#2335) #2341

Merged (1 commit, Aug 16, 2021)
2 changes: 1 addition & 1 deletion .github/workflows/cancel-prev.yml
@@ -6,7 +6,7 @@ jobs:
cancel:
runs-on: ubuntu-latest
steps:
- uses: styfle/cancel-workflow-action@0.8.0
- uses: styfle/cancel-workflow-action@0.9.1
with:
access_token: ${{ github.token }}
workflow_id: core-ci.yml,os-compat-ci.yml,platform-ci.yml,checks.yml
1 change: 1 addition & 0 deletions docs/source/reference/tensor/statistics.rst
@@ -62,6 +62,7 @@ Statistical tests
:nosignatures:

mars.tensor.stats.chisquare
mars.tensor.stats.ks_1samp
mars.tensor.stats.ks_2samp
mars.tensor.stats.power_divergence
mars.tensor.stats.ttest_1samp
2 changes: 1 addition & 1 deletion mars/tensor/stats/__init__.py
@@ -14,6 +14,6 @@

from .entropy import entropy
from .chisquare import chisquare
from .ks import ks_1samp, ks_2samp
from .power_divergence import power_divergence
from .ttest import ttest_1samp, ttest_ind, ttest_ind_from_stats, ttest_rel
from .ks_2samp import ks_2samp
18 changes: 0 additions & 18 deletions mars/tensor/stats/core.py

This file was deleted.

193 changes: 188 additions & 5 deletions mars/tensor/stats/ks_2samp.py → mars/tensor/stats/ks.py
@@ -15,7 +15,8 @@
import math
import warnings
from math import gcd
from typing import Union
from collections import namedtuple
from typing import Callable, Tuple, Union

import numpy as np
from scipy import special
@@ -24,9 +25,9 @@
from ... import tensor as mt
from ...core import ExecutableTuple
from ...typing import TileableType
from .core import KstestResult


KstestResult = namedtuple('KstestResult', ('statistic', 'pvalue'))
Ks_2sampResult = KstestResult


@@ -300,7 +301,7 @@ def _attempt_exact_2kssamp(n1, n2, g, d, alternative): # pragma: no cover
return True, d, prob


def _calc_prob(d, n1, n2, alternative, mode): # pragma: no cover
def _calc_prob_2samp(d, n1, n2, alternative, mode): # pragma: no cover
MAX_AUTO_N = 10000 # 'auto' will attempt to be exact if n1,n2 <= MAX_AUTO_N

g = gcd(n1, n2)
@@ -327,7 +328,7 @@ def _calc_prob(d, n1, n2, alternative, mode): # pragma: no cover
f"Switching to mode={mode}.", RuntimeWarning)

if mode == 'asymp':
# The product n1*n2 is large. Use Smirnov's asymptoptic formula.
# The product n1*n2 is large. Use Smirnov's asymptotic formula.
# Ensure float to avoid overflow in multiplication
# sorted because the one-sided formula is not symmetric in n1, n2
m, n = sorted([float(n1), float(n2)], reverse=True)
@@ -344,6 +345,188 @@
return np.clip(prob, 0, 1)


def _compute_dplus(cdfvals, n):
"""Computes D+ as used in the Kolmogorov-Smirnov test.

Parameters
----------
cdfvals : array_like
Sorted array of CDF values between 0 and 1
n : int
Number of observations

Returns
-------
Maximum distance of the CDF values below Uniform(0, 1)
"""
return (mt.arange(1.0, n + 1) / n - cdfvals).max()


def _compute_dminus(cdfvals, n):
"""Computes D- as used in the Kolmogorov-Smirnov test.

Parameters
----------
cdfvals : array_like
Sorted array of CDF values between 0 and 1
n : int
Number of observations

Returns
-------
Maximum distance of the CDF values above Uniform(0, 1)
"""
return (cdfvals - mt.arange(0.0, n) / n).max()
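
For intuition, the two helpers above compute the maximum signed gaps between the empirical CDF steps and the fitted CDF evaluated at the sorted sample. A minimal plain-NumPy sketch of the same computation (illustrative only; it assumes numpy and scipy rather than mars, and is not part of this PR):

import numpy as np
from scipy.stats import norm

# Illustrative mirror of _compute_dplus / _compute_dminus on NumPy arrays.
x = np.sort(norm.rvs(size=20, random_state=0))
n = x.shape[0]
cdfvals = norm.cdf(x)

dplus = (np.arange(1.0, n + 1) / n - cdfvals).max()   # how far the ECDF sits above the fitted CDF (D+)
dminus = (cdfvals - np.arange(0.0, n) / n).max()      # how far the fitted CDF sits above the ECDF (D-)
d_two_sided = max(dplus, dminus)                      # the two-sided statistic D
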


def ks_1samp(x: Union[np.ndarray, list, TileableType],
cdf: Callable,
args: Tuple = (),
alternative: str = 'two-sided',
mode: str = 'auto'):
"""
Performs the one-sample Kolmogorov-Smirnov test for goodness of fit.

This test compares the underlying distribution F(x) of a sample
against a given continuous distribution G(x). See Notes for a description
of the available null and alternative hypotheses.

Parameters
----------
x : array_like
a 1-D array of observations of iid random variables.
cdf : callable
callable used to calculate the cdf.
args : tuple, sequence, optional
Distribution parameters, used with `cdf`.
alternative : {'two-sided', 'less', 'greater'}, optional
Defines the null and alternative hypotheses. Default is 'two-sided'.
Please see explanations in the Notes below.
mode : {'auto', 'exact', 'approx', 'asymp'}, optional
Defines the distribution used for calculating the p-value.
The following options are available (default is 'auto'):

* 'auto' : selects one of the other options.
* 'exact' : uses the exact distribution of test statistic.
* 'approx' : approximates the two-sided probability with twice
the one-sided probability
* 'asymp': uses asymptotic distribution of test statistic

Returns
-------
statistic : float
KS test statistic, either D, D+ or D- (depending on the value
of 'alternative')
pvalue : float
One-tailed or two-tailed p-value.

See Also
--------
ks_2samp, kstest

Notes
-----
There are three options for the null and corresponding alternative
hypothesis that can be selected using the `alternative` parameter.

- `two-sided`: The null hypothesis is that the two distributions are
identical, F(x)=G(x) for all x; the alternative is that they are not
identical.

- `less`: The null hypothesis is that F(x) >= G(x) for all x; the
alternative is that F(x) < G(x) for at least one x.

- `greater`: The null hypothesis is that F(x) <= G(x) for all x; the
alternative is that F(x) > G(x) for at least one x.

Note that the alternative hypotheses describe the *CDFs* of the
underlying distributions, not the observed values. For example,
suppose x1 ~ F and x2 ~ G. If F(x) > G(x) for all x, the values in
x1 tend to be less than those in x2.

Examples
--------
>>> import numpy as np
>>> from scipy import stats
>>> import mars.tensor as mt
>>> from mars.tensor.stats import ks_1samp

>>> np.random.seed(12345678) #fix random seed to get the same result
>>> x = mt.linspace(-15, 15, 9, chunk_size=5)
>>> ks_1samp(x, stats.norm.cdf).execute()
(0.44435602715924361, 0.038850142705171065)

>>> ks_1samp(stats.norm.rvs(size=100), stats.norm.cdf).execute()
KstestResult(statistic=0.165471391799..., pvalue=0.007331283245...)

*Test against one-sided alternative hypothesis*

Shift distribution to larger values, so that ``CDF(x) < norm.cdf(x)``:

>>> x = stats.norm.rvs(loc=0.2, size=100)
>>> ks_1samp(x, stats.norm.cdf, alternative='less').execute()
KstestResult(statistic=0.235488541678..., pvalue=1.158315030683...)

Reject null hypothesis in favor of alternative hypothesis: less

>>> ks_1samp(x, stats.norm.cdf, alternative='greater').execute()
KstestResult(statistic=0.010167165616..., pvalue=0.972494973653...)

Reject null hypothesis in favor of alternative hypothesis: greater

>>> ks_1samp(x, stats.norm.cdf).execute()
KstestResult(statistic=0.235488541678..., pvalue=2.316630061366...)

Don't reject null hypothesis in favor of alternative hypothesis: two-sided

*Testing t distributed random variables against normal distribution*

With 100 degrees of freedom the t distribution looks close to the normal
distribution, and the K-S test does not reject the hypothesis that the
sample came from the normal distribution:

>>> ks_1samp(stats.t.rvs(100, size=100), stats.norm.cdf).execute()
KstestResult(statistic=0.077844250253..., pvalue=0.553155412513...)

With 3 degrees of freedom the t distribution looks sufficiently different
from the normal distribution, that we can reject the hypothesis that the
sample came from the normal distribution at the 10% level:

>>> ks_1samp(stats.t.rvs(3, size=100), stats.norm.cdf).execute()
KstestResult(statistic=0.118967105356..., pvalue=0.108627114578...)
"""
alternative = {'t': 'two-sided', 'g': 'greater', 'l': 'less'}.get(
alternative.lower()[0], alternative)
if alternative not in ['two-sided', 'greater', 'less']:
raise ValueError("Unexpected alternative %s" % alternative)

x = mt.asarray(x)
N = x.shape[0]
x = mt.sort(x)
cdfvals = x.map_chunk(cdf, args=args, elementwise=True)

if alternative == 'greater':
Dplus = _compute_dplus(cdfvals, N)
return ExecutableTuple(KstestResult(
Dplus, Dplus.map_chunk(distributions.ksone.sf, args=(N,))))

if alternative == 'less':
Dminus = _compute_dminus(cdfvals, N)
return ExecutableTuple(KstestResult(
Dminus, Dminus.map_chunk(distributions.ksone.sf, args=(N,))))

# alternative == 'two-sided':
Dplus = _compute_dplus(cdfvals, N)
Dminus = _compute_dminus(cdfvals, N)
D = mt.max([Dplus, Dminus])
if mode == 'auto': # Always select exact
mode = 'exact'
if mode == 'exact':
prob = D.map_chunk(distributions.kstwo.sf, args=(N,),
elementwise=True)
elif mode == 'asymp':
prob = (D * np.sqrt(N)).map_chunk(distributions.kstwobign.sf,
elementwise=True)
else:
# mode == 'approx'
prob = 2 * D.map_chunk(distributions.ksone.sf, args=(N,),
elementwise=True)
prob = mt.clip(prob, 0, 1)
return ExecutableTuple(KstestResult(D, prob))
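
For the two-sided branch above, the three modes map onto plain scipy.stats.distributions calls applied to a scalar statistic. A hedged sketch of the equivalent computation without mars tensors (the helper name two_sided_pvalue is illustrative, not part of this PR, and it assumes a scipy version that provides kstwo):

import numpy as np
from scipy.stats import distributions

def two_sided_pvalue(d, n, mode='exact'):
    # Scalar mirror of the two-sided p-value branches in ks_1samp above.
    if mode == 'exact':
        prob = distributions.kstwo.sf(d, n)
    elif mode == 'asymp':
        prob = distributions.kstwobign.sf(d * np.sqrt(n))
    else:  # 'approx': twice the one-sided probability
        prob = 2 * distributions.ksone.sf(d, n)
    return np.clip(prob, 0, 1)
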


def ks_2samp(data1: Union[np.ndarray, list, TileableType],
data2: Union[np.ndarray, list, TileableType],
alternative: str = 'two-sided',
@@ -477,7 +660,7 @@ def ks_2samp(data1: Union[np.ndarray, list, TileableType],
maxS = mt.max(cddiffs)
alt2Dvalue = {'less': minS, 'greater': maxS, 'two-sided': mt.maximum(minS, maxS)}
d = alt2Dvalue[alternative]
prob = d.map_chunk(_calc_prob, args=(n1, n2, alternative, mode),
prob = d.map_chunk(_calc_prob_2samp, args=(n1, n2, alternative, mode),
elementwise=True, dtype=d.dtype)

return ExecutableTuple(Ks_2sampResult(d, prob))
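
End-to-end usage of the two functions exported from the renamed module is unchanged by this backport; only the deferred p-value callback is renamed to _calc_prob_2samp. A short usage sketch (sizes, seeds and chunk_size are arbitrary, chosen to mirror the tests below):

import numpy as np
from scipy import stats
import mars.tensor as mt
from mars.tensor.stats import ks_1samp, ks_2samp

rs = np.random.RandomState(12345678)
d1 = mt.tensor(stats.norm.rvs(size=20, loc=0.0, scale=1.0, random_state=rs), chunk_size=5)
d2 = mt.tensor(stats.norm.rvs(size=20, loc=0.5, scale=1.5, random_state=rs), chunk_size=5)

print(ks_1samp(d1, stats.norm.cdf).execute())               # one-sample test against the normal CDF
print(ks_2samp(d1, d2, alternative='two-sided').execute())  # two-sample test
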
31 changes: 29 additions & 2 deletions mars/tensor/stats/tests/test_stats_execute.py
@@ -22,20 +22,21 @@
entropy as sp_entropy,
power_divergence as sp_power_divergence,
chisquare as sp_chisquare,
ks_1samp as sp_ks_1samp,
ks_2samp as sp_ks_2samp,
norm as sp_norm,
ttest_rel as sp_ttest_rel,
ttest_ind as sp_ttest_ind,
ttest_ind_from_stats as sp_ttest_ind_from_stats,
ttest_1samp as sp_ttest_1samp,
norm as sp_norm
)

from mars.lib.version import parse as parse_version
from mars.tensor import tensor
from mars.tensor.stats import (
entropy, power_divergence, chisquare,
ttest_ind, ttest_rel, ttest_1samp, ttest_ind_from_stats,
ks_2samp
ks_1samp, ks_2samp,
)


@@ -197,6 +198,32 @@ def test_t_test_execution(setup):
np.testing.assert_almost_equal(expected[1], result[1])


@pytest.mark.parametrize('chunk_size', [5, 15])
@pytest.mark.parametrize('mode', ['auto', 'less', 'greater', 'asymp', 'approx'])
def test_ks_1samp(setup, chunk_size, mode):
x = tensor(np.linspace(-15, 15, 9), chunk_size=chunk_size)

if mode == 'auto':
result = ks_1samp(x, sp_norm.cdf,
alternative='greater').execute().fetch()
expected = sp_ks_1samp(x, sp_norm.cdf,
alternative='greater')
assert result == expected

result = ks_1samp(x, sp_norm.cdf,
alternative='less').execute().fetch()
expected = sp_ks_1samp(x, sp_norm.cdf,
alternative='less')
assert result == expected

result = ks_1samp(x, sp_norm.cdf, mode=mode).execute().fetch()
expected = sp_ks_1samp(x, sp_norm.cdf, mode=mode)
assert result == expected

with pytest.raises(ValueError):
ks_1samp(x, sp_norm.cdf, alternative='unknown')


@pytest.mark.parametrize('chunk_size', [5, 15])
def test_ks_2samp(setup, chunk_size):
n1 = 10
21 changes: 8 additions & 13 deletions mars/tensor/stats/ttest.py
@@ -15,19 +15,14 @@
from collections import namedtuple

import numpy as np
try:
from scipy import __version__ as sp_version
from scipy.stats import (
ttest_ind as sp_ttest_ind,
ttest_ind_from_stats as sp_ttest_ind_from_stats,
ttest_rel as sp_ttest_rel,
ttest_1samp as sp_ttest_1samp,
)
from scipy.stats import distributions as sp_distributions
except ImportError:
sp_version = None
sp_ttest_1samp = sp_ttest_ind = sp_ttest_ind_from_stats = sp_ttest_rel = None
sp_distributions = None
from scipy import __version__ as sp_version
from scipy.stats import (
ttest_ind as sp_ttest_ind,
ttest_ind_from_stats as sp_ttest_ind_from_stats,
ttest_rel as sp_ttest_rel,
ttest_1samp as sp_ttest_1samp,
)
from scipy.stats import distributions as sp_distributions

from ...core import ExecutableTuple
from ...lib.version import parse as parse_version