From 29311137ebb88dd6a2bc5109537254c658b3e36c Mon Sep 17 00:00:00 2001 From: "wenjun.swj" Date: Fri, 13 Aug 2021 19:23:02 +0800 Subject: [PATCH] Implements mt.stats.ks_1samp --- .github/workflows/cancel-prev.yml | 2 +- docs/source/reference/tensor/statistics.rst | 1 + mars/tensor/stats/__init__.py | 2 +- mars/tensor/stats/core.py | 18 -- mars/tensor/stats/{ks_2samp.py => ks.py} | 193 +++++++++++++++++- mars/tensor/stats/tests/test_stats_execute.py | 31 ++- mars/tensor/stats/ttest.py | 21 +- 7 files changed, 228 insertions(+), 40 deletions(-) delete mode 100644 mars/tensor/stats/core.py rename mars/tensor/stats/{ks_2samp.py => ks.py} (70%) diff --git a/.github/workflows/cancel-prev.yml b/.github/workflows/cancel-prev.yml index f9a671e8c2..7a6a2cbed6 100644 --- a/.github/workflows/cancel-prev.yml +++ b/.github/workflows/cancel-prev.yml @@ -6,7 +6,7 @@ jobs: cancel: runs-on: ubuntu-latest steps: - - uses: styfle/cancel-workflow-action@0.8.0 + - uses: styfle/cancel-workflow-action@0.9.1 with: access_token: ${{ github.token }} workflow_id: core-ci.yml,os-compat-ci.yml,platform-ci.yml,checks.yml diff --git a/docs/source/reference/tensor/statistics.rst b/docs/source/reference/tensor/statistics.rst index adc92e0331..8c9d0f44c0 100644 --- a/docs/source/reference/tensor/statistics.rst +++ b/docs/source/reference/tensor/statistics.rst @@ -62,6 +62,7 @@ Statistical tests :nosignatures: mars.tensor.stats.chisquare + mars.tensor.stats.ks_1samp mars.tensor.stats.ks_2samp mars.tensor.stats.power_divergence mars.tensor.stats.ttest_1samp diff --git a/mars/tensor/stats/__init__.py b/mars/tensor/stats/__init__.py index 4a4a826b39..e55b0ff7c6 100644 --- a/mars/tensor/stats/__init__.py +++ b/mars/tensor/stats/__init__.py @@ -14,6 +14,6 @@ from .entropy import entropy from .chisquare import chisquare +from .ks import ks_1samp, ks_2samp from .power_divergence import power_divergence from .ttest import ttest_1samp, ttest_ind, ttest_ind_from_stats, ttest_rel -from .ks_2samp import ks_2samp diff --git a/mars/tensor/stats/core.py b/mars/tensor/stats/core.py deleted file mode 100644 index f743b78f29..0000000000 --- a/mars/tensor/stats/core.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright 1999-2021 Alibaba Group Holding Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from collections import namedtuple - - -KstestResult = namedtuple('KstestResult', ('statistic', 'pvalue')) diff --git a/mars/tensor/stats/ks_2samp.py b/mars/tensor/stats/ks.py similarity index 70% rename from mars/tensor/stats/ks_2samp.py rename to mars/tensor/stats/ks.py index ab6daec8cb..6f5913f1dd 100644 --- a/mars/tensor/stats/ks_2samp.py +++ b/mars/tensor/stats/ks.py @@ -15,7 +15,8 @@ import math import warnings from math import gcd -from typing import Union +from collections import namedtuple +from typing import Callable, Tuple, Union import numpy as np from scipy import special @@ -24,9 +25,9 @@ from ... import tensor as mt from ...core import ExecutableTuple from ...typing import TileableType -from .core import KstestResult +KstestResult = namedtuple('KstestResult', ('statistic', 'pvalue')) Ks_2sampResult = KstestResult @@ -300,7 +301,7 @@ def _attempt_exact_2kssamp(n1, n2, g, d, alternative): # pragma: no cover return True, d, prob -def _calc_prob(d, n1, n2, alternative, mode): # pragma: no cover +def _calc_prob_2samp(d, n1, n2, alternative, mode): # pragma: no cover MAX_AUTO_N = 10000 # 'auto' will attempt to be exact if n1,n2 <= MAX_AUTO_N g = gcd(n1, n2) @@ -327,7 +328,7 @@ def _calc_prob(d, n1, n2, alternative, mode): # pragma: no cover f"Switching to mode={mode}.", RuntimeWarning) if mode == 'asymp': - # The product n1*n2 is large. Use Smirnov's asymptoptic formula. + # The product n1*n2 is large. Use Smirnov's asymptotic formula. # Ensure float to avoid overflow in multiplication # sorted because the one-sided formula is not symmetric in n1, n2 m, n = sorted([float(n1), float(n2)], reverse=True) @@ -344,6 +345,188 @@ def _calc_prob(d, n1, n2, alternative, mode): # pragma: no cover return np.clip(prob, 0, 1) +def _compute_dplus(cdfvals, n): + """Computes D+ as used in the Kolmogorov-Smirnov test. + Parameters + ---------- + cdfvals: array_like + Sorted array of CDF values between 0 and 1 + Returns + ------- + Maximum distance of the CDF values below Uniform(0, 1) + """ + return (mt.arange(1.0, n + 1) / n - cdfvals).max() + + +def _compute_dminus(cdfvals, n): + """Computes D- as used in the Kolmogorov-Smirnov test. + Parameters + ---------- + cdfvals: array_like + Sorted array of CDF values between 0 and 1 + Returns + ------- + Maximum distance of the CDF values above Uniform(0, 1) + """ + return (cdfvals - mt.arange(0.0, n) / n).max() + + +def ks_1samp(x: Union[np.ndarray, list, TileableType], + cdf: Callable, + args: Tuple = (), + alternative: str = 'two-sided', + mode: str = 'auto'): + """ + Performs the one-sample Kolmogorov-Smirnov test for goodness of fit. + + This test compares the underlying distribution F(x) of a sample + against a given continuous distribution G(x). See Notes for a description + of the available null and alternative hypotheses. + + Parameters + ---------- + x : array_like + a 1-D array of observations of iid random variables. + cdf : callable + callable used to calculate the cdf. + args : tuple, sequence, optional + Distribution parameters, used with `cdf`. + alternative : {'two-sided', 'less', 'greater'}, optional + Defines the null and alternative hypotheses. Default is 'two-sided'. + Please see explanations in the Notes below. + mode : {'auto', 'exact', 'approx', 'asymp'}, optional + Defines the distribution used for calculating the p-value. + The following options are available (default is 'auto'): + + * 'auto' : selects one of the other options. + * 'exact' : uses the exact distribution of test statistic. + * 'approx' : approximates the two-sided probability with twice + the one-sided probability + * 'asymp': uses asymptotic distribution of test statistic + + Returns + ------- + statistic : float + KS test statistic, either D, D+ or D- (depending on the value + of 'alternative') + pvalue : float + One-tailed or two-tailed p-value. + + See Also + -------- + ks_2samp, kstest + + Notes + ----- + There are three options for the null and corresponding alternative + hypothesis that can be selected using the `alternative` parameter. + + - `two-sided`: The null hypothesis is that the two distributions are + identical, F(x)=G(x) for all x; the alternative is that they are not + identical. + + - `less`: The null hypothesis is that F(x) >= G(x) for all x; the + alternative is that F(x) < G(x) for at least one x. + + - `greater`: The null hypothesis is that F(x) <= G(x) for all x; the + alternative is that F(x) > G(x) for at least one x. + + Note that the alternative hypotheses describe the *CDFs* of the + underlying distributions, not the observed values. For example, + suppose x1 ~ F and x2 ~ G. If F(x) > G(x) for all x, the values in + x1 tend to be less than those in x2. + + Examples + -------- + >>> import numpy as np + >>> from scipy import stats + >>> import mars.tensor as mt + >>> from mars.tensor.stats import ks_1samp + + >>> np.random.seed(12345678) #fix random seed to get the same result + >>> x = mt.linspace(-15, 15, 9, chunk_size=5) + >>> ks_1samp(x, stats.norm.cdf).execute() + (0.44435602715924361, 0.038850142705171065) + + >>> ks_1samp(stats.norm.rvs(size=100), stats.norm.cdf).execute() + KstestResult(statistic=0.165471391799..., pvalue=0.007331283245...) + + *Test against one-sided alternative hypothesis* + + Shift distribution to larger values, so that `` CDF(x) < norm.cdf(x)``: + + >>> x = stats.norm.rvs(loc=0.2, size=100) + >>> ks_1samp(x, stats.norm.cdf, alternative='less').execute() + KstestResult(statistic=0.235488541678..., pvalue=1.158315030683...) + + Reject null hypothesis in favor of alternative hypothesis: less + + >>> ks_1samp(x, stats.norm.cdf, alternative='greater').execute() + KstestResult(statistic=0.010167165616..., pvalue=0.972494973653...) + + Reject null hypothesis in favor of alternative hypothesis: greater + + >>> ks_1samp(x, stats.norm.cdf).execute() + KstestResult(statistic=0.235488541678..., pvalue=2.316630061366...) + + Don't reject null hypothesis in favor of alternative hypothesis: two-sided + + *Testing t distributed random variables against normal distribution* + + With 100 degrees of freedom the t distribution looks close to the normal + distribution, and the K-S test does not reject the hypothesis that the + sample came from the normal distribution: + + >>> ks_1samp(stats.t.rvs(100, size=100), stats.norm.cdf).execute() + KstestResult(statistic=0.077844250253..., pvalue=0.553155412513...) + + With 3 degrees of freedom the t distribution looks sufficiently different + from the normal distribution, that we can reject the hypothesis that the + sample came from the normal distribution at the 10% level: + + >>> ks_1samp(stats.t.rvs(3, size=100), stats.norm.cdf).execute() + KstestResult(statistic=0.118967105356..., pvalue=0.108627114578...) + """ + alternative = {'t': 'two-sided', 'g': 'greater', 'l': 'less'}.get( + alternative.lower()[0], alternative) + if alternative not in ['two-sided', 'greater', 'less']: + raise ValueError("Unexpected alternative %s" % alternative) + + x = mt.asarray(x) + N = x.shape[0] + x = mt.sort(x) + cdfvals = x.map_chunk(cdf, args=args, elementwise=True) + + if alternative == 'greater': + Dplus = _compute_dplus(cdfvals, N) + return ExecutableTuple(KstestResult( + Dplus, Dplus.map_chunk(distributions.ksone.sf, args=(N,)))) + + if alternative == 'less': + Dminus = _compute_dminus(cdfvals, N) + return ExecutableTuple(KstestResult( + Dminus, Dminus.map_chunk(distributions.ksone.sf, args=(N,)))) + + # alternative == 'two-sided': + Dplus = _compute_dplus(cdfvals, N) + Dminus = _compute_dminus(cdfvals, N) + D = mt.max([Dplus, Dminus]) + if mode == 'auto': # Always select exact + mode = 'exact' + if mode == 'exact': + prob = D.map_chunk(distributions.kstwo.sf, args=(N,), + elementwise=True) + elif mode == 'asymp': + prob = (D * np.sqrt(N)).map_chunk(distributions.kstwobign.sf, + elementwise=True) + else: + # mode == 'approx' + prob = 2 * D.map_chunk(distributions.ksone.sf, args=(N,), + elementwise=True) + prob = mt.clip(prob, 0, 1) + return ExecutableTuple(KstestResult(D, prob)) + + def ks_2samp(data1: Union[np.ndarray, list, TileableType], data2: Union[np.ndarray, list, TileableType], alternative: str = 'two-sided', @@ -477,7 +660,7 @@ def ks_2samp(data1: Union[np.ndarray, list, TileableType], maxS = mt.max(cddiffs) alt2Dvalue = {'less': minS, 'greater': maxS, 'two-sided': mt.maximum(minS, maxS)} d = alt2Dvalue[alternative] - prob = d.map_chunk(_calc_prob, args=(n1, n2, alternative, mode), + prob = d.map_chunk(_calc_prob_2samp, args=(n1, n2, alternative, mode), elementwise=True, dtype=d.dtype) return ExecutableTuple(Ks_2sampResult(d, prob)) diff --git a/mars/tensor/stats/tests/test_stats_execute.py b/mars/tensor/stats/tests/test_stats_execute.py index 7ca009a01e..1c2aeaea50 100644 --- a/mars/tensor/stats/tests/test_stats_execute.py +++ b/mars/tensor/stats/tests/test_stats_execute.py @@ -22,12 +22,13 @@ entropy as sp_entropy, power_divergence as sp_power_divergence, chisquare as sp_chisquare, + ks_1samp as sp_ks_1samp, ks_2samp as sp_ks_2samp, + norm as sp_norm, ttest_rel as sp_ttest_rel, ttest_ind as sp_ttest_ind, ttest_ind_from_stats as sp_ttest_ind_from_stats, ttest_1samp as sp_ttest_1samp, - norm as sp_norm ) from mars.lib.version import parse as parse_version @@ -35,7 +36,7 @@ from mars.tensor.stats import ( entropy, power_divergence, chisquare, ttest_ind, ttest_rel, ttest_1samp, ttest_ind_from_stats, - ks_2samp + ks_1samp, ks_2samp, ) @@ -197,6 +198,32 @@ def test_t_test_execution(setup): np.testing.assert_almost_equal(expected[1], result[1]) +@pytest.mark.parametrize('chunk_size', [5, 15]) +@pytest.mark.parametrize('mode', ['auto', 'less', 'greater', 'asymp', 'approx']) +def test_ks_1samp(setup, chunk_size, mode): + x = tensor(np.linspace(-15, 15, 9), chunk_size=5) + + if mode == 'auto': + result = ks_1samp(x, sp_norm.cdf, + alternative='greater').execute().fetch() + expected = sp_ks_1samp(x, sp_norm.cdf, + alternative='greater') + assert result == expected + + result = ks_1samp(x, sp_norm.cdf, + alternative='less').execute().fetch() + expected = sp_ks_1samp(x, sp_norm.cdf, + alternative='less') + assert result == expected + + result = ks_1samp(x, sp_norm.cdf, mode=mode).execute().fetch() + expected = sp_ks_1samp(x, sp_norm.cdf, mode=mode) + assert result == expected + + with pytest.raises(ValueError): + ks_1samp(x, sp_norm.cdf, alternative='unknown') + + @pytest.mark.parametrize('chunk_size', [5, 15]) def test_ks_2samp(setup, chunk_size): n1 = 10 diff --git a/mars/tensor/stats/ttest.py b/mars/tensor/stats/ttest.py index dbfa095e98..d9a3eacdb6 100644 --- a/mars/tensor/stats/ttest.py +++ b/mars/tensor/stats/ttest.py @@ -15,19 +15,14 @@ from collections import namedtuple import numpy as np -try: - from scipy import __version__ as sp_version - from scipy.stats import ( - ttest_ind as sp_ttest_ind, - ttest_ind_from_stats as sp_ttest_ind_from_stats, - ttest_rel as sp_ttest_rel, - ttest_1samp as sp_ttest_1samp, - ) - from scipy.stats import distributions as sp_distributions -except ImportError: - sp_version = None - sp_ttest_1samp = sp_ttest_ind = sp_ttest_ind_from_stats = sp_ttest_rel = None - sp_distributions = None +from scipy import __version__ as sp_version +from scipy.stats import ( + ttest_ind as sp_ttest_ind, + ttest_ind_from_stats as sp_ttest_ind_from_stats, + ttest_rel as sp_ttest_rel, + ttest_1samp as sp_ttest_1samp, +) +from scipy.stats import distributions as sp_distributions from ...core import ExecutableTuple from ...lib.version import parse as parse_version