From bdc3e4b99308e87146807860bb1b8c6701535c60 Mon Sep 17 00:00:00 2001 From: Sambit Panda <36676569+sampan501@users.noreply.github.com> Date: Wed, 13 Jan 2021 15:49:52 -0500 Subject: [PATCH] create wrapper class for Energy (#160) * sort imports * fix docs error short underline * add energy source code * fix tutorials not rendering * add more detail to compute distance and kernel --- docs/conf.py | 2 +- docs/reference/ksample.rst | 4 + docs/reference/tools.rst | 2 +- docs/requirements.txt | 2 +- docs/sphinxext/github_link.py | 4 +- docs/tutorials.rst | 7 +- docs/tutorials/independence/__init__.py | 2 +- hyppo/__init__.py | 2 +- hyppo/discrim/_utils.py | 8 +- hyppo/discrim/base.py | 1 + hyppo/discrim/discrim_one_samp.py | 5 +- hyppo/discrim/discrim_two_samp.py | 5 +- hyppo/discrim/tests/test_discrim_one_samp.py | 5 +- hyppo/discrim/tests/test_discrim_two_samp.py | 5 +- hyppo/independence/__init__.py | 6 +- hyppo/independence/_utils.py | 7 +- hyppo/independence/base.py | 27 ++- hyppo/independence/cca.py | 8 +- hyppo/independence/dcorr.py | 31 +++- hyppo/independence/hhg.py | 25 ++- hyppo/independence/hsic.py | 31 ++-- hyppo/independence/kmerf.py | 7 +- hyppo/independence/mgc.py | 26 ++- hyppo/independence/rv.py | 8 +- hyppo/independence/tests/test_cca.py | 2 +- hyppo/independence/tests/test_dcorr.py | 4 +- hyppo/independence/tests/test_hhg.py | 2 +- hyppo/independence/tests/test_hsic.py | 4 +- hyppo/independence/tests/test_kmerf.py | 4 +- hyppo/independence/tests/test_mgc.py | 4 +- hyppo/independence/tests/test_rvcorr.py | 4 +- hyppo/ksample/__init__.py | 1 + hyppo/ksample/_utils.py | 7 +- hyppo/ksample/base.py | 30 +++- hyppo/ksample/energy.py | 177 +++++++++++++++++++ hyppo/ksample/ksamp.py | 40 +++-- hyppo/ksample/tests/test_energy.py | 53 ++++++ hyppo/ksample/tests/test_ksamp.py | 4 +- hyppo/time_series/_utils.py | 6 +- hyppo/time_series/base.py | 28 ++- hyppo/time_series/dcorrx.py | 35 ++-- hyppo/time_series/mgcx.py | 36 ++-- 
hyppo/time_series/tests/test_dcorrx.py | 4 +- hyppo/time_series/tests/test_mgcx.py | 2 +- hyppo/tools/__init__.py | 2 +- hyppo/tools/common.py | 10 +- hyppo/tools/ksample_sim.py | 29 ++- hyppo/tools/tests/test_indep_sim.py | 30 ++-- hyppo/tools/tests/test_ksample_sim.py | 32 ++-- hyppo/tools/tests/test_utils.py | 7 +- setup.py | 3 +- 51 files changed, 577 insertions(+), 213 deletions(-) create mode 100644 hyppo/ksample/energy.py create mode 100644 hyppo/ksample/tests/test_energy.py diff --git a/docs/conf.py b/docs/conf.py index dac21b0ab..ab250301e 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -13,8 +13,8 @@ # documentation root, use os.path.abspath to make it absolute, like shown here. # import os -import sys import shutil +import sys sys.path.insert(0, os.path.abspath("..")) sys.path.insert(0, os.path.abspath("sphinxext")) diff --git a/docs/reference/ksample.rst b/docs/reference/ksample.rst index c420b180a..bac5e55dd 100644 --- a/docs/reference/ksample.rst +++ b/docs/reference/ksample.rst @@ -6,3 +6,7 @@ Nonpar MANOVA via Independence Testing -------------------------------------- .. autoclass:: KSample + +Energy +------ +.. autoclass:: Energy diff --git a/docs/reference/tools.rst b/docs/reference/tools.rst index c177699c0..836f2ecb4 100644 --- a/docs/reference/tools.rst +++ b/docs/reference/tools.rst @@ -120,7 +120,7 @@ Misc ---- Kernel Matrix Computation -"""""""""""""""""" +""""""""""""""""""""""""" .. 
autofunction:: compute_kern Distance Matrix Computation diff --git a/docs/requirements.txt b/docs/requirements.txt index cc99e0408..645273e25 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,7 +1,7 @@ sphinx==1.8.5 sphinx_rtd_theme>=0.4.2 sphinxcontrib-rawfiles -nbsphinx>=0.4.2 +nbsphinx>=0.8.0 ipython>=7.4.0 ipykernel>=5.1.0 numpydoc==0.9.2 \ No newline at end of file diff --git a/docs/sphinxext/github_link.py b/docs/sphinxext/github_link.py index 3e40a35a6..2852d10db 100644 --- a/docs/sphinxext/github_link.py +++ b/docs/sphinxext/github_link.py @@ -1,9 +1,9 @@ -from operator import attrgetter import inspect -import subprocess import os +import subprocess import sys from functools import partial +from operator import attrgetter REVISION_CMD = "git rev-parse --short HEAD" diff --git a/docs/tutorials.rst b/docs/tutorials.rst index 9f61fdd74..013b1601d 100644 --- a/docs/tutorials.rst +++ b/docs/tutorials.rst @@ -1,7 +1,7 @@ Tutorials ********* -.. _tut_package: +.. _indep_tutorials: Independence Tests ------------------ @@ -25,6 +25,8 @@ the algorithms included against each other. tutorials/independence/indep_alg_speed +.. _ksamp_tutorials: + *K*-sample Tests ------------------ The *k*-sample testing problem is generalized as follows: consider random variables @@ -43,6 +45,7 @@ This tutorial overview how to use *k*-sample tests in ``hyppo``. tutorials/ksample/ksample +.. _ts_tutorials: Time-Series Tests ----------------- @@ -70,6 +73,8 @@ This tutorial overview how to use time_series based tests in ``hyppo``. tutorials/time_series/time_series +.. 
_sims_tutorials: + Sims ---- To evaluate existing implmentations and benchmark against other packages, diff --git a/docs/tutorials/independence/__init__.py b/docs/tutorials/independence/__init__.py index b248b409d..99c3e7ce9 100644 --- a/docs/tutorials/independence/__init__.py +++ b/docs/tutorials/independence/__init__.py @@ -1,3 +1,3 @@ -from .power import power_sample, power_dim +from .power import power_dim, power_sample __all__ = ["power_sample", "power_dim"] diff --git a/hyppo/__init__.py b/hyppo/__init__.py index 9ee8ae010..39c02fb0f 100644 --- a/hyppo/__init__.py +++ b/hyppo/__init__.py @@ -1,7 +1,7 @@ +import hyppo.discrim import hyppo.independence import hyppo.ksample import hyppo.time_series import hyppo.tools -import hyppo.discrim __version__ = "0.1.3" diff --git a/hyppo/discrim/_utils.py b/hyppo/discrim/_utils.py index b9b0fb29f..15f0f8d78 100644 --- a/hyppo/discrim/_utils.py +++ b/hyppo/discrim/_utils.py @@ -1,13 +1,7 @@ import numpy as np - from sklearn.metrics import pairwise_distances -from ..tools import ( - contains_nan, - check_ndarray_xy, - convert_xy_float64, - check_reps, -) +from ..tools import check_ndarray_xy, check_reps, contains_nan, convert_xy_float64 class _CheckInputs: diff --git a/hyppo/discrim/base.py b/hyppo/discrim/base.py index 8ac4655c6..a315bea0f 100644 --- a/hyppo/discrim/base.py +++ b/hyppo/discrim/base.py @@ -1,4 +1,5 @@ from abc import ABC, abstractmethod + import numpy as np diff --git a/hyppo/discrim/discrim_one_samp.py b/hyppo/discrim/discrim_one_samp.py index 590b1d3a2..e35c4a8a9 100644 --- a/hyppo/discrim/discrim_one_samp.py +++ b/hyppo/discrim/discrim_one_samp.py @@ -1,8 +1,9 @@ -from ._utils import _CheckInputs import numpy as np -from .base import DiscriminabilityTest from scipy._lib._util import MapWrapper +from ._utils import _CheckInputs +from .base import DiscriminabilityTest + class DiscrimOneSample(DiscriminabilityTest): r""" diff --git a/hyppo/discrim/discrim_two_samp.py b/hyppo/discrim/discrim_two_samp.py 
index 405d880c8..60e6d14cb 100644 --- a/hyppo/discrim/discrim_two_samp.py +++ b/hyppo/discrim/discrim_two_samp.py @@ -1,9 +1,10 @@ -from ._utils import _CheckInputs import numpy as np from numba import njit -from .base import DiscriminabilityTest from scipy._lib._util import MapWrapper +from ._utils import _CheckInputs +from .base import DiscriminabilityTest + class DiscrimTwoSample(DiscriminabilityTest): r""" diff --git a/hyppo/discrim/tests/test_discrim_one_samp.py b/hyppo/discrim/tests/test_discrim_one_samp.py index ca657089e..d5ffcb4a0 100644 --- a/hyppo/discrim/tests/test_discrim_one_samp.py +++ b/hyppo/discrim/tests/test_discrim_one_samp.py @@ -1,6 +1,7 @@ -import pytest import numpy as np -from numpy.testing import assert_almost_equal, assert_warns, assert_raises +import pytest +from numpy.testing import assert_almost_equal, assert_raises, assert_warns + from .. import DiscrimOneSample diff --git a/hyppo/discrim/tests/test_discrim_two_samp.py b/hyppo/discrim/tests/test_discrim_two_samp.py index ad5bb4dba..aa162514e 100644 --- a/hyppo/discrim/tests/test_discrim_two_samp.py +++ b/hyppo/discrim/tests/test_discrim_two_samp.py @@ -1,6 +1,7 @@ -import pytest import numpy as np -from numpy.testing import assert_almost_equal, assert_warns, assert_raises +import pytest +from numpy.testing import assert_almost_equal, assert_raises, assert_warns + from .. 
import DiscrimTwoSample diff --git a/hyppo/independence/__init__.py b/hyppo/independence/__init__.py index 92912cd1a..bb7a0680f 100644 --- a/hyppo/independence/__init__.py +++ b/hyppo/independence/__init__.py @@ -1,9 +1,9 @@ -from .rv import RV from .cca import CCA -from .hhg import HHG from .dcorr import Dcorr +from .hhg import HHG from .hsic import Hsic -from .mgc import MGC from .kmerf import KMERF +from .mgc import MGC +from .rv import RV __all__ = [s for s in dir()] # add imported tests to __all__ diff --git a/hyppo/independence/_utils.py b/hyppo/independence/_utils.py index 397a9b80b..69d086336 100644 --- a/hyppo/independence/_utils.py +++ b/hyppo/independence/_utils.py @@ -1,11 +1,6 @@ import numpy as np -from ..tools import ( - contains_nan, - check_ndarray_xy, - convert_xy_float64, - check_reps, -) +from ..tools import check_ndarray_xy, check_reps, contains_nan, convert_xy_float64 class _CheckInputs: diff --git a/hyppo/independence/base.py b/hyppo/independence/base.py index 57068d1c0..b03ea505c 100644 --- a/hyppo/independence/base.py +++ b/hyppo/independence/base.py @@ -9,13 +9,26 @@ class IndependenceTest(ABC): Parameters ---------- - compute_distance : callable(), optional (default: euclidean) - A function that computes the distance or similarity among the samples - within each data matrix. Set to `None` if `x` and `y` are already - distance matrices. To call a custom function, either create the - distance matrix before-hand or create a function of the form - ``compute_distance(x)`` where `x` is the data matrix for which - pairwise distances are calculated. + compute_distance : callable(), optional (default: None) + A function that computes the distance among the samples within each + data matrix. + Valid strings for ``metric`` are, as defined in + ``sklearn.metrics.pairwise_distances``, + + - From scikit-learn: [‘cityblock’, ‘cosine’, ‘euclidean’, ‘l1’, ‘l2’, + ‘manhattan’] See the documentation for scipy.spatial.distance for details + on these metrics. 
+ - From scipy.spatial.distance: [‘braycurtis’, ‘canberra’, ‘chebyshev’, + ‘correlation’, ‘dice’, ‘hamming’, ‘jaccard’, ‘kulsinski’, ‘mahalanobis’, + ‘minkowski’, ‘rogerstanimoto’, ‘russellrao’, ‘seuclidean’, + ‘sokalmichener’, ‘sokalsneath’, ‘sqeuclidean’, ‘yule’] See the + documentation for scipy.spatial.distance for details on these metrics. + + Set to `None` or `precomputed` if `x` and `y` are already distance + matrices. To call a custom function, either create the distance matrix + before-hand or create a function of the form ``metric(x, **kwargs)`` + where `x` is the data matrix for which pairwise distances are + calculated and kwargs are extra arguements to send to your custom function. """ def __init__(self, compute_distance=None, **kwargs): diff --git a/hyppo/independence/cca.py b/hyppo/independence/cca.py index a463e3d79..db8c60b6c 100644 --- a/hyppo/independence/cca.py +++ b/hyppo/independence/cca.py @@ -1,7 +1,7 @@ import numpy as np -from .base import IndependenceTest from ._utils import _CheckInputs +from .base import IndependenceTest class CCA(IndependenceTest): @@ -42,17 +42,17 @@ class CCA(IndependenceTest): References ---------- .. [#1CCA] Härdle, W. K., & Simar, L. (2015). Canonical correlation - analysis. In *Applied multivariate statistical analysis* (pp. + analysis. In Applied multivariate statistical analysis (pp. 443-454). Springer, Berlin, Heidelberg. .. [#2CCA] Knapp, T. R. (1978). Canonical correlation analysis: A general parametric significance-testing system. *Psychological Bulletin*, 85(2), 410. .. [#3CCA] Hotelling, H. (1992). Relations between two sets of variates. - In *Breakthroughs in statistics* (pp. 162-190). Springer, New + In Breakthroughs in statistics (pp. 162-190). Springer, New York, NY. .. [#4CCA] Hardoon, D. R., Szedmak, S., & Shawe-Taylor, J. (2004). Canonical correlation analysis: An overview with application to - learning methods. *Neural computation*, 16(12), 2639-2664. + learning methods. 
Neural computation, 16(12), 2639-2664. """ def __init__(self): diff --git a/hyppo/independence/dcorr.py b/hyppo/independence/dcorr.py index 9df0d7c0c..df0da694b 100644 --- a/hyppo/independence/dcorr.py +++ b/hyppo/independence/dcorr.py @@ -1,9 +1,9 @@ import numpy as np from numba import njit -from ..tools import compute_dist, chi2_approx, check_perm_blocks_dim -from .base import IndependenceTest +from ..tools import check_perm_blocks_dim, chi2_approx, compute_dist from ._utils import _CheckInputs +from .base import IndependenceTest class Dcorr(IndependenceTest): @@ -16,13 +16,26 @@ class Dcorr(IndependenceTest): Parameters ---------- - compute_distance : callable(), optional (default: euclidean) + compute_distance : callable(), optional (default: "euclidean") A function that computes the distance among the samples within each - data matrix. Set to `None` if `x` and `y` are already distance + data matrix. + Valid strings for ``metric`` are, as defined in + ``sklearn.metrics.pairwise_distances``, + + - From scikit-learn: [‘cityblock’, ‘cosine’, ‘euclidean’, ‘l1’, ‘l2’, + ‘manhattan’] See the documentation for scipy.spatial.distance for details + on these metrics. + - From scipy.spatial.distance: [‘braycurtis’, ‘canberra’, ‘chebyshev’, + ‘correlation’, ‘dice’, ‘hamming’, ‘jaccard’, ‘kulsinski’, ‘mahalanobis’, + ‘minkowski’, ‘rogerstanimoto’, ‘russellrao’, ‘seuclidean’, + ‘sokalmichener’, ‘sokalsneath’, ‘sqeuclidean’, ‘yule’] See the + documentation for scipy.spatial.distance for details on these metrics. + + Set to `None` or `precomputed` if `x` and `y` are already distance matrices. To call a custom function, either create the distance matrix - before-hand or create a function of the form ``compute_distance(x)`` + before-hand or create a function of the form ``metric(x, **kwargs)`` where `x` is the data matrix for which pairwise distances are - calculated. + calculated and kwargs are extra arguements to send to your custom function. 
bias : bool (default: False) Whether or not to use the biased or unbiased test statistics. @@ -89,10 +102,10 @@ class Dcorr(IndependenceTest): ---------- .. [#1Dcor] Székely, G. J., Rizzo, M. L., & Bakirov, N. K. (2007). Measuring and testing dependence by correlation of distances. - *The annals of statistics*, 35(6), 2769-2794. + The annals of statistics, 35(6), 2769-2794. .. [#2Dcor] Székely, G. J., & Rizzo, M. L. (2014). Partial distance - correlation with methods for dissimilarities. *The Annals of - Statistics*, 42(6), 2382-2412. + correlation with methods for dissimilarities. The Annals of + Statistics, 42(6), 2382-2412. """ def __init__(self, compute_distance="euclidean", bias=False, **kwargs): diff --git a/hyppo/independence/hhg.py b/hyppo/independence/hhg.py index 21e6c6b2c..63ad5795f 100644 --- a/hyppo/independence/hhg.py +++ b/hyppo/independence/hhg.py @@ -2,8 +2,8 @@ from numba import njit from ..tools import compute_dist -from .base import IndependenceTest from ._utils import _CheckInputs +from .base import IndependenceTest class HHG(IndependenceTest): @@ -18,13 +18,26 @@ class HHG(IndependenceTest): Parameters ---------- - compute_distance : callable(), optional (default: euclidean) + compute_distance : callable(), optional (default: "euclidean") A function that computes the distance among the samples within each - data matrix. Set to `None` if `x` and `y` are already distance + data matrix. + Valid strings for ``metric`` are, as defined in + ``sklearn.metrics.pairwise_distances``, + + - From scikit-learn: [‘cityblock’, ‘cosine’, ‘euclidean’, ‘l1’, ‘l2’, + ‘manhattan’] See the documentation for scipy.spatial.distance for details + on these metrics. 
+ - From scipy.spatial.distance: [‘braycurtis’, ‘canberra’, ‘chebyshev’, + ‘correlation’, ‘dice’, ‘hamming’, ‘jaccard’, ‘kulsinski’, ‘mahalanobis’, + ‘minkowski’, ‘rogerstanimoto’, ‘russellrao’, ‘seuclidean’, + ‘sokalmichener’, ‘sokalsneath’, ‘sqeuclidean’, ‘yule’] See the + documentation for scipy.spatial.distance for details on these metrics. + + Set to `None` or `precomputed` if `x` and `y` are already distance matrices. To call a custom function, either create the distance matrix - before-hand or create a function of the form ``compute_distance(x)`` + before-hand or create a function of the form ``metric(x, **kwargs)`` where `x` is the data matrix for which pairwise distances are - calculated. + calculated and kwargs are extra arguements to send to your custom function. See Also -------- @@ -84,7 +97,7 @@ class HHG(IndependenceTest): ---------- .. [#1HHG] Heller, R., Heller, Y., & Gorfine, M. (2012). A consistent multivariate test of association based on ranks of distances. - *Biometrika*, 100(2), 503-510. + Biometrika, 100(2), 503-510. """ def __init__(self, compute_distance="euclidean", **kwargs): diff --git a/hyppo/independence/hsic.py b/hyppo/independence/hsic.py index fef4a6bcc..9c9d8d720 100644 --- a/hyppo/independence/hsic.py +++ b/hyppo/independence/hsic.py @@ -1,9 +1,9 @@ import numpy as np -from .base import IndependenceTest -from ._utils import _CheckInputs +from ..tools import chi2_approx, compute_kern from . import Dcorr -from ..tools import compute_kern, chi2_approx +from ._utils import _CheckInputs +from .base import IndependenceTest class Hsic(IndependenceTest): @@ -18,13 +18,20 @@ class Hsic(IndependenceTest): Parameters ---------- - compute_kernel : callable(), optional (default: rbf kernel) - A function that computes the similarity among the samples within each - data matrix. 
Set to `None` if `x` and `y` are already similarity + compute_kernel : callable(), optional (default: "gaussian") + A function that computes the kernel similarity among the samples within each + data matrix. + Valid strings for ``metric`` are, as defined in + ``sklearn.metrics.pairwise.pairwise_kernels``, + + ['additive_chi2', 'chi2', 'linear', 'poly', 'polynomial', 'gaussian', + 'laplacian', 'sigmoid', 'cosine'] + + Set to `None` or `precomputed` if `x` and `y` are already distance matrices. To call a custom function, either create the distance matrix - before-hand or create a function of the form ``compute_kernel(x)`` - where `x` is the data matrix for which pairwise similarties are - calculated. + before-hand or create a function of the form ``metric(x, **kwargs)`` + where `x` is the data matrix for which pairwise kernel similarity matrices are + calculated and kwargs are extra arguements to send to your custom function. bias : bool (default: False) Whether or not to use the biased or unbiased test statistics. @@ -91,10 +98,10 @@ class Hsic(IndependenceTest): ---------- .. [#1Hsic] Gretton, A., Fukumizu, K., Teo, C. H., Song, L., Schölkopf, B., & Smola, A. J. (2008). A kernel statistical test of - independence. In *Advances in neural information processing - systems* (pp. 585-592). + independence. In Advances in neural information processing + systems (pp. 585-592). .. [#2Hsic] Gretton, A., & GyĂśrfi, L. (2010). Consistent nonparametric - tests of independence. *Journal of Machine Learning Research*, + tests of independence. Journal of Machine Learning Research, 11(Apr), 1391-1423. 
""" diff --git a/hyppo/independence/kmerf.py b/hyppo/independence/kmerf.py index d7d0e276e..a7122eff8 100644 --- a/hyppo/independence/kmerf.py +++ b/hyppo/independence/kmerf.py @@ -1,11 +1,10 @@ import numpy as np -from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor from sklearn.metrics import pairwise_distances -from .base import IndependenceTest -from ._utils import _CheckInputs, sim_matrix from . import Dcorr - +from ._utils import _CheckInputs, sim_matrix +from .base import IndependenceTest FOREST_TYPES = { "classifier": RandomForestClassifier, diff --git a/hyppo/independence/mgc.py b/hyppo/independence/mgc.py index c7281a05f..1560f7ff6 100644 --- a/hyppo/independence/mgc.py +++ b/hyppo/independence/mgc.py @@ -1,9 +1,10 @@ import warnings + from scipy.stats import multiscale_graphcorr from ..tools import compute_dist -from .base import IndependenceTest from ._utils import _CheckInputs +from .base import IndependenceTest class MGC(IndependenceTest): @@ -29,13 +30,26 @@ class MGC(IndependenceTest): Parameters ---------- - compute_distance : callable(), optional (default: euclidean) + compute_distance : callable(), optional (default: "euclidean") A function that computes the distance among the samples within each - data matrix. Set to `None` if `x` and `y` are already distance + data matrix. + Valid strings for ``metric`` are, as defined in + ``sklearn.metrics.pairwise_distances``, + + - From scikit-learn: [‘cityblock’, ‘cosine’, ‘euclidean’, ‘l1’, ‘l2’, + ‘manhattan’] See the documentation for scipy.spatial.distance for details + on these metrics. 
+ - From scipy.spatial.distance: [‘braycurtis’, ‘canberra’, ‘chebyshev’, + ‘correlation’, ‘dice’, ‘hamming’, ‘jaccard’, ‘kulsinski’, ‘mahalanobis’, + ‘minkowski’, ‘rogerstanimoto’, ‘russellrao’, ‘seuclidean’, + ‘sokalmichener’, ‘sokalsneath’, ‘sqeuclidean’, ‘yule’] See the + documentation for scipy.spatial.distance for details on these metrics. + + Set to `None` or `precomputed` if `x` and `y` are already distance matrices. To call a custom function, either create the distance matrix - before-hand or create a function of the form ``compute_distance(x)`` + before-hand or create a function of the form ``metric(x, **kwargs)`` where `x` is the data matrix for which pairwise distances are - calculated. + calculated and kwargs are extra arguements to send to your custom function. See Also -------- @@ -53,7 +67,7 @@ class MGC(IndependenceTest): the distance matrix of :math:`y`. :math:`D^x` and :math:`D^y` are modified to be mean zero columnwise. This results in two :math:`n \times n` distance matrices :math:`A` and :math:`B` (the - centering and unbiased modification) [3]_. + centering and unbiased modification) [#3MGC]_. + For all values :math:`k` and :math:`l` from :math:`1, ..., n`, diff --git a/hyppo/independence/rv.py b/hyppo/independence/rv.py index f4199fa7a..e8a131cd9 100644 --- a/hyppo/independence/rv.py +++ b/hyppo/independence/rv.py @@ -1,7 +1,7 @@ import numpy as np -from .base import IndependenceTest from ._utils import _CheckInputs +from .base import IndependenceTest class RV(IndependenceTest): @@ -44,11 +44,11 @@ class RV(IndependenceTest): References ---------- .. [#1RV] Robert, P., & Escoufier, Y. (1976). A unifying tool for linear - multivariate statistical methods: the RV‐coefficient. *Journal + multivariate statistical methods: the RV‐coefficient. Journal of the Royal Statistical Society: Series C (Applied - Statistics)*, 25(3), 257-265. + Statistics), 25(3), 257-265. .. [#2RV] Escoufier, Y. (1973). Le traitement des variables vectorielles. 
- *Biometrics*, 751-760. + Biometrics, 751-760. """ def __init__(self): diff --git a/hyppo/independence/tests/test_cca.py b/hyppo/independence/tests/test_cca.py index f1d7bf24b..0927f4baa 100644 --- a/hyppo/independence/tests/test_cca.py +++ b/hyppo/independence/tests/test_cca.py @@ -1,5 +1,5 @@ -import pytest import numpy as np +import pytest from numpy.testing import assert_almost_equal, assert_raises, assert_warns from ...tools import linear diff --git a/hyppo/independence/tests/test_dcorr.py b/hyppo/independence/tests/test_dcorr.py index 333ecb390..f5b34a018 100644 --- a/hyppo/independence/tests/test_dcorr.py +++ b/hyppo/independence/tests/test_dcorr.py @@ -1,6 +1,6 @@ -import pytest import numpy as np -from numpy.testing import assert_almost_equal, assert_warns, assert_raises +import pytest +from numpy.testing import assert_almost_equal, assert_raises, assert_warns from ...tools import linear from .. import Dcorr diff --git a/hyppo/independence/tests/test_hhg.py b/hyppo/independence/tests/test_hhg.py index a537644c6..b15b820c6 100644 --- a/hyppo/independence/tests/test_hhg.py +++ b/hyppo/independence/tests/test_hhg.py @@ -1,5 +1,5 @@ -import pytest import numpy as np +import pytest from numpy.testing import assert_almost_equal, assert_raises, assert_warns from ...tools import linear diff --git a/hyppo/independence/tests/test_hsic.py b/hyppo/independence/tests/test_hsic.py index b1ff19d65..d6270097f 100644 --- a/hyppo/independence/tests/test_hsic.py +++ b/hyppo/independence/tests/test_hsic.py @@ -1,6 +1,6 @@ -import pytest import numpy as np -from numpy.testing import assert_almost_equal, assert_warns, assert_raises +import pytest +from numpy.testing import assert_almost_equal, assert_raises, assert_warns from ...tools import linear from .. 
import Hsic diff --git a/hyppo/independence/tests/test_kmerf.py b/hyppo/independence/tests/test_kmerf.py index fd8ce73c9..1bb6e4019 100644 --- a/hyppo/independence/tests/test_kmerf.py +++ b/hyppo/independence/tests/test_kmerf.py @@ -1,8 +1,8 @@ -import pytest import numpy as np +import pytest from numpy.testing import assert_approx_equal -from ...tools import linear, spiral, multimodal_independence +from ...tools import linear, multimodal_independence, spiral from .. import KMERF diff --git a/hyppo/independence/tests/test_mgc.py b/hyppo/independence/tests/test_mgc.py index caa9601b6..0db90518e 100644 --- a/hyppo/independence/tests/test_mgc.py +++ b/hyppo/independence/tests/test_mgc.py @@ -1,8 +1,8 @@ -import pytest import numpy as np +import pytest from numpy.testing import assert_approx_equal -from ...tools import linear, spiral, multimodal_independence +from ...tools import linear, multimodal_independence, spiral from .. import MGC diff --git a/hyppo/independence/tests/test_rvcorr.py b/hyppo/independence/tests/test_rvcorr.py index 1a9d42456..1954c3a02 100644 --- a/hyppo/independence/tests/test_rvcorr.py +++ b/hyppo/independence/tests/test_rvcorr.py @@ -1,6 +1,6 @@ -import pytest import numpy as np -from numpy.testing import assert_almost_equal, assert_warns, assert_raises +import pytest +from numpy.testing import assert_almost_equal, assert_raises, assert_warns from ...tools import linear from .. 
import RV diff --git a/hyppo/ksample/__init__.py b/hyppo/ksample/__init__.py index bfc1d0335..7271b74d9 100644 --- a/hyppo/ksample/__init__.py +++ b/hyppo/ksample/__init__.py @@ -1,3 +1,4 @@ +from .energy import Energy from .ksamp import KSample __all__ = [s for s in dir()] # add imported tests to __all__ diff --git a/hyppo/ksample/_utils.py b/hyppo/ksample/_utils.py index 9cf82a64d..5536e0436 100644 --- a/hyppo/ksample/_utils.py +++ b/hyppo/ksample/_utils.py @@ -1,11 +1,10 @@ import numpy as np from ..tools import contains_nan -from ..independence import CCA, Dcorr, HHG, RV, Hsic, MGC, KMERF class _CheckInputs: - def __init__(self, inputs, indep_test, reps=None): + def __init__(self, inputs, indep_test=None, reps=None): self.inputs = inputs self.reps = reps self.indep_test = indep_test @@ -57,8 +56,8 @@ def _convert_inputs_float64(self): return [np.asarray(i).astype(np.float64) for i in self.inputs] def _check_indep_test(self): - tests = [CCA, Dcorr, HHG, RV, Hsic, MGC, KMERF] - if self.indep_test.__class__ not in tests and self.indep_test: + tests = ["cca", "dcorr", "hhg", "rv", "hsic", "mgc", "kmerf"] + if self.indep_test not in tests and self.indep_test is not None: raise ValueError("indep_test must be in {}".format(tests)) def _check_min_samples(self): diff --git a/hyppo/ksample/base.py b/hyppo/ksample/base.py index 7bd0c3710..4659cd620 100644 --- a/hyppo/ksample/base.py +++ b/hyppo/ksample/base.py @@ -7,23 +7,37 @@ class KSampleTest(ABC): Parameters ---------- - compute_distance : callable(), optional (default: euclidean) - A function that computes the distance or similarity among the samples - within each data matrix. Set to `None` if `x` and `y` are already - distance matrices. To call a custom function, either create the - distance matrix before-hand or create a function of the form - ``compute_distance(x)`` where `x` is the data matrix for which - pairwise distances are calculated. 
+ compute_distance : callable(), optional (default: None) + A function that computes the distance among the samples within each + data matrix. + Valid strings for ``compute_distance`` are, as defined in + ``sklearn.metrics.pairwise_distances``, + + - From scikit-learn: [‘cityblock’, ‘cosine’, ‘euclidean’, ‘l1’, ‘l2’, + ‘manhattan’] See the documentation for scipy.spatial.distance for details + on these metrics. + - From scipy.spatial.distance: [‘braycurtis’, ‘canberra’, ‘chebyshev’, + ‘correlation’, ‘dice’, ‘hamming’, ‘jaccard’, ‘kulsinski’, ‘mahalanobis’, + ‘minkowski’, ‘rogerstanimoto’, ‘russellrao’, ‘seuclidean’, + ‘sokalmichener’, ‘sokalsneath’, ‘sqeuclidean’, ‘yule’] See the + documentation for scipy.spatial.distance for details on these metrics. + + Set to `None` or `precomputed` if `x` and `y` are already distance + matrices. To call a custom function, either create the distance matrix + before-hand or create a function of the form ``metric(x, **kwargs)`` + where `x` is the data matrix for which pairwise distances are + calculated and kwargs are extra arguments to send to your custom function. bias : bool (default: False) Whether or not to use the biased or unbiased test statistics. Only applies to ``Dcorr`` and ``Hsic``. 
""" - def __init__(self, bias=False, **kwargs): + def __init__(self, compute_distance=None, bias=False, **kwargs): # set statistic and p-value self.stat = None self.pvalue = None self.bias = bias + self.compute_distance = compute_distance self.kwargs = kwargs super().__init__() diff --git a/hyppo/ksample/energy.py b/hyppo/ksample/energy.py new file mode 100644 index 000000000..f63c177fd --- /dev/null +++ b/hyppo/ksample/energy.py @@ -0,0 +1,177 @@ +import numpy as np +from numba import njit + +from ..independence.dcorr import _center_distmat +from ..tools import compute_dist +from ._utils import _CheckInputs +from .base import KSampleTest +from .ksamp import KSample + + +class Energy(KSampleTest): + r""" + Class for calculating the Energy test statistic and p-value. + + Energy is a powerful multivariate 2-sample test. It leverages distance matrix + capabilities (similar to tests like distance correlation or Dcorr). In fact, Energy + statistic is equivalent to our 2-sample formulation of nonparametric MANOVA via + independence testing, i.e. ``hyppo.ksample.KSample``, + and to Dcorr, Hilbert Schmidt Independence Criterion (Hsic), and Maximum Mean + Discrepancy [#1Ener]_ [#2Ener]_ (see "See Also" section for links). + + Parameters + ---------- + bias : bool (default: False) + Whether or not to use the biased or unbiased test statistics. + + Notes + ----- + Traditionally, the formulation for the 2-sample Energy statistic + is as follows [#3Ener]_: + + Define + :math:`\{ u_i \stackrel{iid}{\sim} F_U,\ i = 1, ..., n \}` and + :math:`\{ v_j \stackrel{iid}{\sim} F_V,\ j = 1, ..., m \}` as two groups + of samples deriving from different distributions with the same + dimensionality. If :math:`d(\cdot, \cdot)` is a distance metric (e.g. euclidean) + then, + + .. 
math:: + + Energy_{n, m}(\mathbf{u}, \mathbf{v}) = \frac{1}{n^2 m^2} + \left( 2nm \sum_{i = 1}^n \sum_{j = 1}^m d(u_i, v_j) - m^2 + \sum_{i,j=1}^n d(u_i, u_j) - n^2 \sum_{i, j=1}^m d(v_i, v_j) \right) + + The implementation in the ``hyppo.ksample.KSample`` class (using Dcorr) is in + fact equivalent to this implementation (for p-values) and statistics are + equivalent up to a scaling factor [#1Ener]_. + + The p-value returned is calculated using a + `permutation test `_. + The fast version of the test (for :math:`k`-sample Dcorr and Hsic) uses a + `chi squared approximation `_. + + References + ---------- + .. [#1Ener] Panda, S., Shen, C., Perry, R., Zorn, J., Lutz, A., Priebe, C. E., & + Vogelstein, J. T. (2019). Nonparametric MANOVA via Independence + Testing. arXiv e-prints, arXiv-1910. + .. [#2Ener] Shen, C., & Vogelstein, J. T. (2018). The exact equivalence of distance + and kernel methods for hypothesis testing. arXiv preprint + arXiv:1806.05514. + .. [#3Ener] Székely, G. J., & Rizzo, M. L. (2004). Testing for equal distributions + in high dimension. InterStat, 5(16.10), 1249-1272. + """ + + def __init__(self, compute_distance="euclidean", bias=False, **kwargs): + # set is_distance to true if compute_distance is None + self.is_distance = False + if not compute_distance: + self.is_distance = True + KSampleTest.__init__( + self, compute_distance=compute_distance, bias=bias, **kwargs + ) + + def _statistic(self, x, y): + r""" + Calculates the Energy test statistic. + + Parameters + ---------- + x, y : ndarray + Input data matrices. `x` and `y` must have the same number of + samples. That is, the shapes must be `(n, p)` and `(n, q)` where + `n` is the number of samples and `p` and `q` are the number of + dimensions. Alternatively, `x` and `y` can be distance matrices, + where the shapes must both be `(n, n)`. 
+ """ + distx = x + disty = y + n = x.shape[0] + m = y.shape[0] + + if not self.is_distance: + distx, disty = compute_dist( + x, y, metric=self.compute_distance, **self.kwargs + ) + + # exact equivalence transformation Dcorr and Energy + stat = ( + _dcov(distx, disty, self.bias) * (2 * (n ** 2) * (m ** 2)) / ((n + m) ** 4) + ) + self.stat = stat + + return stat + + def test(self, x, y, reps=1000, workers=1, auto=True): + r""" + Calculates the Energy test statistic and p-value. + + Parameters + ---------- + x, y : ndarray + Input data matrices. `x` and `y` must have the same number of + samples. That is, the shapes must be `(n, p)` and `(n, q)` where + `n` is the number of samples and `p` and `q` are the number of + dimensions. Alternatively, `x` and `y` can be distance matrices, + where the shapes must both be `(n, n)`. + reps : int, optional (default: 1000) + The number of replications used to estimate the null distribution + when using the permutation test used to calculate the p-value. + workers : int, optional (default: 1) + The number of cores to parallelize the p-value computation over. + Supply -1 to use all cores available to the Process. + auto : bool (default: True) + Automatically uses fast approximation when sample size and size of array + is greater than 20. If True, and sample size is greater than 20, a fast + chi2 approximation will be run. Parameters ``reps`` and ``workers`` are + irrelevant in this case. + + Returns + ------- + stat : float + The computed *k*-Sample statistic. + pvalue : float + The computed *k*-Sample p-value. 
+ + Examples + -------- + >>> import numpy as np + >>> from hyppo.ksample import Energy + >>> x = np.arange(7) + >>> y = x + >>> stat, pvalue = Energy().test(x, y) + >>> '%.3f, %.1f' % (stat, pvalue) + '0.000, 1.0' + """ + check_input = _CheckInputs( + inputs=[x, y], + indep_test="dcorr", + ) + x, y = check_input() + + # observed statistic + stat = Energy()._statistic(x, y) + + # since stat transformation is invariant under permutation, 2-sample Dcorr + # pvalue is identical to Energy + _, pvalue = KSample("Dcorr").test(x, y, reps=reps, workers=workers, auto=auto) + + return stat, pvalue + + +@njit +def _dcov(distx, disty, bias): # pragma: no cover + """Calculate the Dcov (distance covariance) test statistic""" + # center distance matrices + cent_distx = _center_distmat(distx, bias) + cent_disty = _center_distmat(disty, bias) + + N = distx.shape[0] + + if bias: + stat = 1 / (N ** 2) * np.trace(np.multiply(cent_distx, cent_disty)) + else: + stat = 1 / (N * (N - 3)) * np.trace(np.multiply(cent_distx, cent_disty)) + + return stat diff --git a/hyppo/ksample/ksamp.py b/hyppo/ksample/ksamp.py index 1b89bbe7b..b94588329 100644 --- a/hyppo/ksample/ksamp.py +++ b/hyppo/ksample/ksamp.py @@ -1,6 +1,6 @@ -from .base import KSampleTest -from ..independence import CCA, Dcorr, HHG, RV, Hsic, MGC, KMERF +from ..independence import CCA, HHG, KMERF, MGC, RV, Dcorr, Hsic from ._utils import _CheckInputs, k_sample_transform +from .base import KSampleTest class KSample(KSampleTest): @@ -18,13 +18,26 @@ class KSample(KSampleTest): indep_test : {"CCA", "Dcorr", "HHG", "RV", "Hsic", "MGC"} A string corresponding to the desired independence test from ``mgc.independence``. This is not case sensitive. - compute_distance : callable(), optional (default: euclidean) + compute_distance : callable(), optional (default: "euclidean") A function that computes the distance among the samples within each - data matrix. Set to `None` if `x` and `y` are already distance + data matrix.
+ Valid strings for ``metric`` are, as defined in + ``sklearn.metrics.pairwise_distances``, + + - From scikit-learn: [‘cityblock’, ‘cosine’, ‘euclidean’, ‘l1’, ‘l2’, + ‘manhattan’] See the documentation for scipy.spatial.distance for details + on these metrics. + - From scipy.spatial.distance: [‘braycurtis’, ‘canberra’, ‘chebyshev’, + ‘correlation’, ‘dice’, ‘hamming’, ‘jaccard’, ‘kulsinski’, ‘mahalanobis’, + ‘minkowski’, ‘rogerstanimoto’, ‘russellrao’, ‘seuclidean’, + ‘sokalmichener’, ‘sokalsneath’, ‘sqeuclidean’, ‘yule’] See the + documentation for scipy.spatial.distance for details on these metrics. + + Set to `None` or `precomputed` if `x` and `y` are already distance matrices. To call a custom function, either create the distance matrix - before-hand or create a function of the form ``compute_distance(x)`` + before-hand or create a function of the form ``metric(x, **kwargs)`` where `x` is the data matrix for which pairwise distances are - calculated. + calculated and kwargs are extra arguments to send to your custom function. bias : bool (default: False) Whether or not to use the biased or unbiased test statistics. Only applies to ``Dcorr`` and ``Hsic``. @@ -47,8 +60,8 @@ class KSample(KSampleTest): The closely related independence testing problem can be generalized similarly: Given a set of paired data - :math:`\{\left(x_i, y_i \right) \stackrel{iid}{\sim} F_{XY}, - \ i = 1, ..., N\}`, the problem that we are testing is, + :math:`\{\left(x_i, y_i \right) \stackrel{iid}{\sim} F_{XY}, \ i = 1, ..., N\}`, + the problem that we are testing is, .. math:: @@ -58,7 +71,7 @@ class KSample(KSampleTest): By manipulating the inputs of the *k*-sample test, we can create concatenated versions of the inputs and another label matrix which are necessarily paired. Then, any nonparametric test can be performed on - this data. That is, + this data.
Letting :math:`n = \sum_{i=1}^k n_i`, define new data matrices :math:`\mathbf{x}` and :math:`\mathbf{y}` such that, @@ -122,7 +135,7 @@ class KSample(KSampleTest): proportional to how many labels (ways) samples differ by, a hierarchy of distances between samples thought to be true if the null hypothesis is rejected. - Performing a multilevel test involves constructing :math:x` and :math:`y` using + Performing a multilevel test involves constructing :math:`x` and :math:`y` using either of the methods above and then performing a block permutation [#2Ksamp]_. Essentially, the permutation is striated, where permutation is limited to be within a block of samples or between blocks of samples, but not both. This is done because @@ -180,6 +193,11 @@ def __init__(self, indep_test, compute_distance="euclidean", bias=False, **kwarg else: self.indep_test = indep_test() + # set is_distance to true if compute_distance is None + self.is_distance = False + if not compute_distance: + self.is_distance = True + KSampleTest.__init__( self, compute_distance=compute_distance, bias=bias, **kwargs ) @@ -250,7 +268,7 @@ def test(self, *args, reps=1000, workers=1, auto=True): inputs = list(args) check_input = _CheckInputs( inputs=inputs, - indep_test=self.indep_test, + indep_test=self.indep_test_name, ) inputs = check_input() if self.indep_test_name == "kmerf": diff --git a/hyppo/ksample/tests/test_energy.py b/hyppo/ksample/tests/test_energy.py new file mode 100644 index 000000000..3a634aeea --- /dev/null +++ b/hyppo/ksample/tests/test_energy.py @@ -0,0 +1,53 @@ +import numpy as np +import pytest +from numpy.testing import assert_almost_equal, assert_raises + +from ...tools import linear, rot_2samp +from .. 
import Energy + + +class TestEnergy: + @pytest.mark.parametrize( + "n, obs_stat, obs_pvalue", + [(200, 4.28e-7, 0.001), (100, 8.24e-5, 0.001)], + ) + def test_energy_linear_oned(self, n, obs_stat, obs_pvalue): + np.random.seed(123456789) + x, y = rot_2samp(linear, n, 1) + stat, pvalue = Energy().test(x, y, auto=False) + + assert_almost_equal(stat, obs_stat, decimal=1) + assert_almost_equal(pvalue, obs_pvalue, decimal=1) + + +class TestEnergyErrorWarn: + """Tests errors and warnings derived from Energy.""" + + def test_error_notndarray(self): + # raises error if x or y is not a ndarray + x = np.arange(20) + y = [5] * 20 + z = np.arange(5) + assert_raises(ValueError, Energy().test, x, y, z) + + def test_error_shape(self): + # raises error if number of samples different (n) + x = np.arange(100).reshape(25, 4) + y = x.reshape(10, 10) + z = x + assert_raises(ValueError, Energy().test, x, y, z) + + def test_error_lowsamples(self): + # raises error if samples are low (< 3) + x = np.arange(3) + y = np.arange(3) + assert_raises(ValueError, Energy().test, x, y) + + def test_error_nans(self): + # raises error if inputs contain NaNs + x = np.arange(20, dtype=float) + x[0] = np.nan + assert_raises(ValueError, Energy().test, x, x) + + y = np.arange(20) + assert_raises(ValueError, Energy().test, x, y) diff --git a/hyppo/ksample/tests/test_ksamp.py b/hyppo/ksample/tests/test_ksamp.py index 5cf8bb8d1..c4626537e 100644 --- a/hyppo/ksample/tests/test_ksamp.py +++ b/hyppo/ksample/tests/test_ksamp.py @@ -1,10 +1,10 @@ -import pytest import numpy as np +import pytest from numpy.testing import assert_almost_equal, assert_raises +from ...independence import CCA, Dcorr from ...tools import linear, rot_2samp from ..
import KSample -from ...independence import CCA, Dcorr class TestKSample: diff --git a/hyppo/time_series/_utils.py b/hyppo/time_series/_utils.py index 2e64e1784..71282742c 100644 --- a/hyppo/time_series/_utils.py +++ b/hyppo/time_series/_utils.py @@ -2,14 +2,14 @@ import numpy as np +from ..independence import MGC from ..tools import ( - contains_nan, check_ndarray_xy, - convert_xy_float64, check_reps, compute_dist, + contains_nan, + convert_xy_float64, ) -from ..independence import MGC class _CheckInputs: diff --git a/hyppo/time_series/base.py b/hyppo/time_series/base.py index b318619b9..0f7480825 100644 --- a/hyppo/time_series/base.py +++ b/hyppo/time_series/base.py @@ -12,10 +12,26 @@ class TimeSeriesTest(ABC): Parameters ---------- - compute_distance : callable, optional - Function indicating distance metric (or alternatively the kernel) to - use. Calculates the pairwise distance for each input, by default - euclidean. + compute_distance : callable, optional (default: None) + A function that computes the distance among the samples within each + data matrix. + Valid strings for ``metric`` are, as defined in + ``sklearn.metrics.pairwise_distances``, + + - From scikit-learn: [‘cityblock’, ‘cosine’, ‘euclidean’, ‘l1’, ‘l2’, + ‘manhattan’] See the documentation for scipy.spatial.distance for details + on these metrics. + - From scipy.spatial.distance: [‘braycurtis’, ‘canberra’, ‘chebyshev’, + ‘correlation’, ‘dice’, ‘hamming’, ‘jaccard’, ‘kulsinski’, ‘mahalanobis’, + ‘minkowski’, ‘rogerstanimoto’, ‘russellrao’, ‘seuclidean’, + ‘sokalmichener’, ‘sokalsneath’, ‘sqeuclidean’, ‘yule’] See the + documentation for scipy.spatial.distance for details on these metrics. + + Set to `None` or `precomputed` if `x` and `y` are already distance + matrices. 
To call a custom function, either create the distance matrix + before-hand or create a function of the form ``metric(x, **kwargs)`` + where `x` is the data matrix for which pairwise distances are + calculated and kwargs are extra arguments to send to your custom function. Attributes ---------- @@ -29,7 +45,7 @@ class TimeSeriesTest(ABC): euclidean. """ - def __init__(self, compute_distance="euclidean", max_lag=0, **kwargs): + def __init__(self, compute_distance=None, max_lag=0, **kwargs): # set statistic and p-value self.stat = None self.pvalue = None @@ -37,8 +53,6 @@ def __init__(self, compute_distance="euclidean", max_lag=0, **kwargs): self.kwargs = kwargs # set compute_distance kernel - if not compute_distance: - compute_distance = None self.compute_distance = compute_distance super().__init__() diff --git a/hyppo/time_series/dcorrx.py b/hyppo/time_series/dcorrx.py index 272dc7f3a..b1d126a04 100644 --- a/hyppo/time_series/dcorrx.py +++ b/hyppo/time_series/dcorrx.py @@ -1,6 +1,6 @@ -from .base import TimeSeriesTest -from ._utils import _CheckInputs, compute_stat from ..independence import Dcorr +from ._utils import _CheckInputs, compute_stat +from .base import TimeSeriesTest class DcorrX(TimeSeriesTest): @@ -13,13 +13,26 @@ class DcorrX(TimeSeriesTest): Parameters ---------- - compute_distance : callable(), optional (default: euclidean) + compute_distance : callable(), optional (default: "euclidean") A function that computes the distance among the samples within each - data matrix. Set to `None` if `x` and `y` are already distance + data matrix. + Valid strings for ``metric`` are, as defined in + ``sklearn.metrics.pairwise_distances``, + + - From scikit-learn: [‘cityblock’, ‘cosine’, ‘euclidean’, ‘l1’, ‘l2’, + ‘manhattan’] See the documentation for scipy.spatial.distance for details + on these metrics.
+ - From scipy.spatial.distance: [‘braycurtis’, ‘canberra’, ‘chebyshev’, + ‘correlation’, ‘dice’, ‘hamming’, ‘jaccard’, ‘kulsinski’, ‘mahalanobis’, + ‘minkowski’, ‘rogerstanimoto’, ‘russellrao’, ‘seuclidean’, + ‘sokalmichener’, ‘sokalsneath’, ‘sqeuclidean’, ‘yule’] See the + documentation for scipy.spatial.distance for details on these metrics. + + Set to `None` or `precomputed` if `x` and `y` are already distance matrices. To call a custom function, either create the distance matrix - before-hand or create a function of the form ``compute_distance(x)`` + before-hand or create a function of the form ``metric(x, **kwargs)`` where `x` is the data matrix for which pairwise distances are - calculated. + calculated and kwargs are extra arguments to send to your custom function. max_lag : int, optional (default: 0) The maximum number of lags in the past to check dependence between `x` and the @@ -32,7 +45,7 @@ class DcorrX(TimeSeriesTest): Notes ----- - The statistic can be derived as follows: + The statistic can be derived as follows [#1DcorX]_: Let :math:`x` and :math:`y` be :math:`(n, p)` and :math:`(n, q)` series respectively, which each contain :math:`y` observations of the series @@ -43,14 +56,14 @@ class DcorrX(TimeSeriesTest): .. math:: - \mathrm{DcorrX}_n (x, y) = \sum_{j=0}^M frac{n-j}{n} - \mathrm{Dcorr}_n (x[j:n], y[0:(n-j)]) + DcorrX_n (x, y) = \sum_{j=0}^M \frac{n-j}{n} + Dcorr_n (x[j:n], y[0:(n-j)]) References ---------- .. [#1DcorX] Mehta, R., Chung, J., Shen C., Xu T., Vogelstein, J. T. (2019). - A Consistent Independence Test for Multivariate Time-Series. - *ArXiv* + A Consistent Independence Test for Multivariate Time-Series.
+ ArXiv """ def __init__(self, compute_distance="euclidean", max_lag=0, **kwargs): diff --git a/hyppo/time_series/mgcx.py b/hyppo/time_series/mgcx.py index 33fa0f37e..6caf03562 100644 --- a/hyppo/time_series/mgcx.py +++ b/hyppo/time_series/mgcx.py @@ -1,6 +1,6 @@ -from .base import TimeSeriesTest -from ._utils import _CheckInputs, compute_stat, compute_scale_at_lag from ..independence import MGC +from ._utils import _CheckInputs, compute_scale_at_lag, compute_stat +from .base import TimeSeriesTest class MGCX(TimeSeriesTest): @@ -13,14 +13,26 @@ class MGCX(TimeSeriesTest): Parameters ---------- - compute_distance : callable(), optional (default: euclidean) + compute_distance : callable(), optional (default: "euclidean") A function that computes the distance among the samples within each - data matrix. Set to `None` if `x` and `y` are already distance + data matrix. + Valid strings for ``metric`` are, as defined in + ``sklearn.metrics.pairwise_distances``, + + - From scikit-learn: [‘cityblock’, ‘cosine’, ‘euclidean’, ‘l1’, ‘l2’, + ‘manhattan’] See the documentation for scipy.spatial.distance for details + on these metrics. + - From scipy.spatial.distance: [‘braycurtis’, ‘canberra’, ‘chebyshev’, + ‘correlation’, ‘dice’, ‘hamming’, ‘jaccard’, ‘kulsinski’, ‘mahalanobis’, + ‘minkowski’, ‘rogerstanimoto’, ‘russellrao’, ‘seuclidean’, + ‘sokalmichener’, ‘sokalsneath’, ‘sqeuclidean’, ‘yule’] See the + documentation for scipy.spatial.distance for details on these metrics. + + Set to `None` or `precomputed` if `x` and `y` are already distance matrices. To call a custom function, either create the distance matrix - before-hand or create a function of the form ``compute_distance(x)`` + before-hand or create a function of the form ``metric(x, **kwargs)`` where `x` is the data matrix for which pairwise distances are - calculated. - + calculated and kwargs are extra arguments to send to your custom function.
max_lag : int, optional (default: 0) The maximum number of lags in the past to check dependence between `x` and the shifted `y`. Also the :math:`M` hyperparmeter below. @@ -32,7 +44,7 @@ class MGCX(TimeSeriesTest): Notes ----- - The statistic can be derived as follows: + The statistic can be derived as follows [#1MgcX]_: Let :math:`x` and :math:`y` be :math:`(n, p)` and :math:`(n, q)` series respectively, which each contain :math:`y` observations of the series :math:`(X_t)` @@ -43,14 +55,14 @@ class MGCX(TimeSeriesTest): .. math:: - \mathrm{MGCX}_n (x, y) = \sum_{j=0}^M frac{n-j}{n} - \mathrm{MGC}_n (x[j:n], y[0:(n-j)]) + MGCX_n (x, y) = \sum_{j=0}^M \frac{n-j}{n} + MGC_n (x[j:n], y[0:(n-j)]) References ---------- - .. [#1DcorX] Mehta, R., Chung, J., Shen C., Xu T., Vogelstein, J. T. (2019). + .. [#1MgcX] Mehta, R., Chung, J., Shen C., Xu T., Vogelstein, J. T. (2019). A Consistent Independence Test for Multivariate Time-Series. - *ArXiv* + ArXiv """ def __init__(self, compute_distance="euclidean", max_lag=0, **kwargs): diff --git a/hyppo/time_series/tests/test_dcorrx.py b/hyppo/time_series/tests/test_dcorrx.py index f135ff0e4..32a40439a 100644 --- a/hyppo/time_series/tests/test_dcorrx.py +++ b/hyppo/time_series/tests/test_dcorrx.py @@ -1,10 +1,10 @@ -import pytest import numpy as np +import pytest from numpy.testing import assert_almost_equal from ...independence import Dcorr -from .. import DcorrX from ...tools import cross_corr_ar, nonlinear_process +from .. import DcorrX class TestDcorrXStat: diff --git a/hyppo/time_series/tests/test_mgcx.py b/hyppo/time_series/tests/test_mgcx.py index 172f60716..e6027faad 100644 --- a/hyppo/time_series/tests/test_mgcx.py +++ b/hyppo/time_series/tests/test_mgcx.py @@ -1,8 +1,8 @@ import numpy as np from numpy.testing import assert_almost_equal, assert_array_less -from .. import MGCX from ...tools import nonlinear_process +from .. 
import MGCX class TestMGCXStat: diff --git a/hyppo/tools/__init__.py b/hyppo/tools/__init__.py index f0be4fdd7..9b06288a8 100755 --- a/hyppo/tools/__init__.py +++ b/hyppo/tools/__init__.py @@ -1,6 +1,6 @@ +from .common import * from .indep_sim import * from .ksample_sim import * from .time_series_sim import * -from .common import * __all__ = [s for s in dir() if not s.startswith("_")] # remove dunders diff --git a/hyppo/tools/common.py b/hyppo/tools/common.py index 9787c8175..101098e01 100644 --- a/hyppo/tools/common.py +++ b/hyppo/tools/common.py @@ -1,7 +1,7 @@ import warnings -from joblib import Parallel, delayed import numpy as np +from joblib import Parallel, delayed from scipy.stats.distributions import chi2 from sklearn.metrics import pairwise_distances from sklearn.metrics.pairwise import pairwise_kernels @@ -129,7 +129,7 @@ def compute_kern(x, y, metric="gaussian", workers=1, **kwargs): dimensions. Alternatively, if `x` and `y` can be distance matrices, where the shapes must both be `(n, n)`, no kernel will be computed. metric : str, optional (default: "gaussian") - A function that computes the distance among the samples within each + A function that computes the kernel similarity among the samples within each data matrix. Valid strings for ``metric`` are, as defined in ``sklearn.metrics.pairwise.pairwise_kernels``, @@ -140,7 +140,7 @@ def compute_kern(x, y, metric="gaussian", workers=1, **kwargs): Set to `None` or `precomputed` if `x` and `y` are already distance matrices. To call a custom function, either create the distance matrix before-hand or create a function of the form ``metric(x, **kwargs)`` - where `x` is the data matrix for which pairwise distances are + where `x` is the data matrix for which pairwise kernel similarity matrices are calculated and kwargs are extra arguements to send to your custom function. workers : int, optional (default: 1) The number of cores to parallelize the p-value computation over. 
@@ -191,7 +191,7 @@ def compute_dist(x, y, metric="euclidean", workers=None, **kwargs): `n` is the number of samples and `p` and `q` are the number of dimensions. Alternatively, if `x` and `y` can be distance matrices, where the shapes must both be `(n, n)`, no kernel will be computed. - metric : str, optional (default: "gaussian") + metric : str, optional (default: "euclidean") A function that computes the distance among the samples within each data matrix. Valid strings for ``metric`` are, as defined in @@ -421,7 +421,7 @@ def perm_test(calc_stat, x, y, reps=1000, workers=1, is_distsim=True, perm_block Defines blocks of exchangeable samples during the permutation test. If None, all samples can be permuted with one another. Requires `n` rows. Constructs a tree graph with all samples initially at - the root node. Each column partitions samples from the same leaf with + the root node. Each column partitions samples from the same leaf with shared column label into a child of that leaf. During the permutation test, samples within the same final leaf node are exchangeable and blocks of samples with a common parent node are exchangeable. 
If a diff --git a/hyppo/tools/ksample_sim.py b/hyppo/tools/ksample_sim.py index 600d048a4..70e7fe647 100644 --- a/hyppo/tools/ksample_sim.py +++ b/hyppo/tools/ksample_sim.py @@ -1,29 +1,28 @@ import numpy as np from .indep_sim import ( - linear, - spiral, - exponential, + circle, cubic, + diamond, + ellipse, + exponential, + fourth_root, joint_normal, - step, - quadratic, - w_shaped, - uncorrelated_bernoulli, + linear, logarithmic, - fourth_root, + multimodal_independence, + multiplicative_noise, + quadratic, sin_four_pi, sin_sixteen_pi, - two_parabolas, - circle, - ellipse, - diamond, - multiplicative_noise, + spiral, square, - multimodal_independence, + step, + two_parabolas, + uncorrelated_bernoulli, + w_shaped, ) - _SIMS = [ linear, spiral, diff --git a/hyppo/tools/tests/test_indep_sim.py b/hyppo/tools/tests/test_indep_sim.py index 62101616c..23f8d608d 100644 --- a/hyppo/tools/tests/test_indep_sim.py +++ b/hyppo/tools/tests/test_indep_sim.py @@ -1,28 +1,28 @@ -import pytest import numpy as np +import pytest from numpy.testing import assert_equal from .. import ( - linear, - spiral, - exponential, + circle, cubic, + diamond, + ellipse, + exponential, + fourth_root, joint_normal, - step, - quadratic, - w_shaped, - uncorrelated_bernoulli, + linear, logarithmic, - fourth_root, + multimodal_independence, + multiplicative_noise, + quadratic, sin_four_pi, sin_sixteen_pi, - two_parabolas, - circle, - ellipse, - diamond, - multiplicative_noise, + spiral, square, - multimodal_independence, + step, + two_parabolas, + uncorrelated_bernoulli, + w_shaped, ) diff --git a/hyppo/tools/tests/test_ksample_sim.py b/hyppo/tools/tests/test_ksample_sim.py index df88d040d..d6e39aaba 100644 --- a/hyppo/tools/tests/test_ksample_sim.py +++ b/hyppo/tools/tests/test_ksample_sim.py @@ -1,30 +1,30 @@ -import pytest import numpy as np +import pytest from numpy.testing import assert_equal from .. 
import ( - linear, - spiral, - exponential, + circle, cubic, + diamond, + ellipse, + exponential, + fourth_root, joint_normal, - step, - quadratic, - w_shaped, - uncorrelated_bernoulli, + linear, logarithmic, - fourth_root, + multimodal_independence, + multiplicative_noise, + quadratic, + rot_2samp, sin_four_pi, sin_sixteen_pi, - two_parabolas, - circle, - ellipse, - diamond, - multiplicative_noise, + spiral, square, - multimodal_independence, - rot_2samp, + step, trans_2samp, + two_parabolas, + uncorrelated_bernoulli, + w_shaped, ) diff --git a/hyppo/tools/tests/test_utils.py b/hyppo/tools/tests/test_utils.py index 42261fef6..e48d84363 100644 --- a/hyppo/tools/tests/test_utils.py +++ b/hyppo/tools/tests/test_utils.py @@ -1,11 +1,12 @@ -from ..common import _PermTree +import numpy as np from numpy.testing import ( - assert_array_less, assert_allclose, + assert_array_less, assert_equal, assert_raises, ) -import numpy as np + +from ..common import _PermTree class TestPermTree: diff --git a/setup.py b/setup.py index bf6d29602..7a3998978 100644 --- a/setup.py +++ b/setup.py @@ -1,8 +1,9 @@ import os import sys -from setuptools import setup, find_packages from sys import platform +from setuptools import find_packages, setup + PACKAGE_NAME = "hyppo" DESCRIPTION = "A comprehensive independence testing package" with open("README.md", "r") as f: