Add Fast kmerf (#369)

* make kmerf fast * use kernel on y as well * update kmerf kernel * remove weird import * fix k-sample-transform rf * make error trace better * fix ksample rf k-sample-transform * cast v to int for k-sample * fix kmerf p-values * update kmerf k-sample * fix kmerf unit tests * fix build errors * remove is_ksamp after saving the values
neurodata · May 24, 2023 · f028f44 · f028f44
1 parent a6ab700
commit f028f44
Show file tree

Hide file tree

Showing 6 changed files with 108 additions and 18 deletions.
diff --git a/hyppo/independence/_utils.py b/hyppo/independence/_utils.py
@@ -50,7 +50,9 @@ def _check_nd_indeptest(self):
         ny, _ = self.y.shape
         if nx != ny:
             raise ValueError(
-                "Shape mismatch, x and y must have shape " "[n, p] and [n, q]."
+                "Shape mismatch, x and y must have shape [n, p] and [n, q], found shape {} and {}".format(
+                    self.x.shape, self.y.shape
+                )
             )
 
     def _check_min_samples(self):

diff --git a/hyppo/independence/kmerf.py b/hyppo/independence/kmerf.py
@@ -1,6 +1,7 @@
 from typing import NamedTuple
 
 import numpy as np
+from scipy.stats.distributions import chi2
 from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
 from sklearn.metrics import pairwise_distances
 
@@ -36,6 +37,31 @@ class KMERF(IndependenceTest):
         ``test`` is categorial, use the "classifier" keyword.
     ntrees : int, default: 500
         The number of trees used in the random forest.
+    compute_distance : str, callable, or None, default: "euclidean"
+        A function that computes the distance among the samples for `y`.
+        Valid strings for ``compute_distance`` are, as defined in
+        :func:`sklearn.metrics.pairwise_distances`,
+
+            - From scikit-learn: [``"euclidean"``, ``"cityblock"``, ``"cosine"``,
+              ``"l1"``, ``"l2"``, ``"manhattan"``] See the documentation for
+              :mod:`scipy.spatial.distance` for details
+              on these metrics.
+            - From scipy.spatial.distance: [``"braycurtis"``, ``"canberra"``,
+              ``"chebyshev"``, ``"correlation"``, ``"dice"``, ``"hamming"``,
+              ``"jaccard"``, ``"kulsinski"``, ``"mahalanobis"``, ``"minkowski"``,
+              ``"rogerstanimoto"``, ``"russellrao"``, ``"seuclidean"``,
+              ``"sokalmichener"``, ``"sokalsneath"``, ``"sqeuclidean"``,
+              ``"yule"``] See the documentation for :mod:`scipy.spatial.distance` for
+              details on these metrics.
+
+        Set to ``None`` or ``"precomputed"`` if ``y`` is already a distance
+        matrices. To call a custom function, either create the distance matrix
+        before-hand or create a function of the form ``metric(x, **kwargs)``
+        where ``x`` is the data matrix for which pairwise distances are
+        calculated and ``**kwargs`` are extra arguements to send to your custom
+        function.
+    distance_kwargs : dict
+        Arbitrary keyword arguments for ``compute_distance``.
     **kwargs
         Additional arguments used for the forest (see
         :class:`sklearn.ensemble.RandomForestClassifier` or
@@ -98,12 +124,28 @@ class KMERF(IndependenceTest):
     .. footbibliography::
     """
 
-    def __init__(self, forest="regressor", ntrees=500, **kwargs):
+    def __init__(
+        self,
+        forest="regressor",
+        ntrees=500,
+        compute_distance="euclidean",
+        distance_kwargs={},
+        **kwargs
+    ):
+        self.is_distance = False
+        self.distance_kwargs = distance_kwargs
+        if not compute_distance:
+            self.is_distance = True
+        self.is_ksamp = False
+        if "is_ksamp" in kwargs.keys():
+            self.is_ksamp = True
+            self.k_sample_transform = kwargs["is_ksamp"]
+            del kwargs["is_ksamp"]
         if forest in FOREST_TYPES.keys():
             self.clf = FOREST_TYPES[forest](n_estimators=ntrees, **kwargs)
         else:
             raise ValueError("Forest must be of type classification or regression")
-        IndependenceTest.__init__(self)
+        IndependenceTest.__init__(self, compute_distance=compute_distance)
 
     def statistic(self, x, y):
         r"""
@@ -122,11 +164,20 @@ def statistic(self, x, y):
         stat : float
             The computed KMERF statistic.
         """
-        y = y.reshape(-1)
-        self.clf.fit(x, y)
+        rf_y = y
+        if rf_y.shape[1] == 1:
+            rf_y = rf_y.ravel()
+        self.clf.fit(x, rf_y)
+
         distx = np.sqrt(1 - sim_matrix(self.clf, x))
-        y = y.reshape(-1, 1)
-        disty = pairwise_distances(y, metric="euclidean")
+        # can use induced kernel if shapes are the same, otherwise
+        # default to compute_distance
+        if x.shape[1] == y.shape[1]:
+            disty = np.sqrt(1 - sim_matrix(self.clf, y))
+        else:
+            disty = pairwise_distances(
+                y, metric=self.compute_distance, **self.distance_kwargs
+            )
         stat = _dcorr(distx, disty, bias=False, is_fast=False)
         self.stat = stat
 
@@ -135,7 +186,7 @@ def statistic(self, x, y):
 
         return stat
 
-    def test(self, x, y, reps=1000, workers=1, random_state=None):
+    def test(self, x, y, reps=1000, workers=1, auto=True, random_state=None):
         r"""
         Calculates the KMERF test statistic and p-value.
 
@@ -152,6 +203,13 @@ def test(self, x, y, reps=1000, workers=1, random_state=None):
         workers : int, default: 1
             The number of cores to parallelize the p-value computation over.
             Supply ``-1`` to use all cores available to the Process.
+        auto : bool, default: True
+            Automatically uses fast approximation when `n` and size of array
+            is greater than 20. If ``True``, and sample size is greater than 20, then
+            :class:`hyppo.tools.chi2_approx` will be run. Parameters ``reps`` and
+            ``workers`` are
+            irrelevant in this case. Otherwise, :class:`hyppo.tools.perm_test` will be
+            run.
 
         Returns
         -------
@@ -177,9 +235,26 @@ def test(self, x, y, reps=1000, workers=1, random_state=None):
         check_input = _CheckInputs(x, y, reps=reps)
         x, y = check_input()
 
-        stat, pvalue = super(KMERF, self).test(
-            x, y, reps, workers, is_distsim=False, random_state=random_state
-        )
+        if auto and x.shape[0] > 20:
+            n = x.shape[0]
+            stat = self.statistic(x, y)
+            if self.is_ksamp:
+                xs = self.k_sample_transform([x, x], test_type="rf")
+                ys = self.k_sample_transform([y, y], test_type="rf")
+            else:
+                xs = (x, x)
+                ys = (y, y)
+            statx = self.statistic(*xs)
+            staty = self.statistic(*ys)
+            pvalue = chi2.sf(stat / np.sqrt(statx * staty) * n + 1, 1)
+            # pvalue = chi2.sf(stat * n + 1, 1)
+            self.stat = stat
+            self.pvalue = pvalue
+            self.null_dist = None
+        else:
+            stat, pvalue = super(KMERF, self).test(
+                x, y, reps, workers, is_distsim=False, random_state=random_state
+            )
         kmerf_dict = {"feat_importance": self.importances}
 
         return KMERFTestOutput(stat, pvalue, kmerf_dict)
diff --git a/hyppo/independence/tests/test_kmerf.py b/hyppo/independence/tests/test_kmerf.py
@@ -13,9 +13,17 @@ class TestKMERFStat(object):
     @pytest.mark.parametrize(
         "sim, obs_stat, obs_pvalue",
         [
-            (linear, 0.253, 1.0),  # test linear simulation
-            (spiral, 0.037, 1.0),  # test spiral simulation
-            (multimodal_independence, -0.0363, 1.0),  # test independence simulation
+            (linear, 1.0, 9.19834440770024e-24),  # test linear simulation
+            (
+                spiral,
+                0.1835550720881665,
+                1.3087565060526607e-05,
+            ),  # test spiral simulation
+            (
+                multimodal_independence,
+                -0.0059788,
+                0.5390103019274002,
+            ),  # test independence simulation
         ],
     )
     def test_oned(self, sim, obs_stat, obs_pvalue):

diff --git a/hyppo/ksample/_utils.py b/hyppo/ksample/_utils.py
@@ -49,7 +49,8 @@ def check_dim(self):
     def _check_nd_ksampletest(self, dims):
         if len(set(dims)) > 1:
             raise ValueError(
-                "Shape mismatch, inputs must have shape " "[n, p] and [m, p]."
+                "Shape mismatch, inputs must have shape "
+                "[n, p] and [m, p]. Found shapes {}".format(list(set(dims)))
             )
 
     def _convert_inputs_float64(self):
@@ -103,8 +104,8 @@ def k_sample_transform(inputs, test_type="normal"):
         raise ValueError("Test cannot be run, the inputs have 0 variance")
 
     if test_type == "rf":
-        v = np.concatenate(
-            [np.repeat(i, inputs[i].shape[0]) for i in range(n_inputs)], axis=0
+        v = np.vstack(
+            [np.repeat(i, inputs[i].shape[0]).reshape(-1, 1) for i in range(n_inputs)]
         )
     elif test_type == "normal":
         if n_inputs == 2:

diff --git a/hyppo/ksample/ksamp.py b/hyppo/ksample/ksamp.py
@@ -184,7 +184,7 @@ def __init__(self, indep_test, compute_distkern="euclidean", bias=False, **kwarg
             "hsic": {"bias": bias, "compute_kernel": compute_distkern},
             "hhg": {"compute_distance": compute_distkern},
             "mgc": {"compute_distance": compute_distkern},
-            "kmerf": {"forest": "classifier"},
+            "kmerf": {"forest": "classifier", "is_ksamp": k_sample_transform},
             "rv": {},
             "cca": {},
         }
@@ -293,10 +293,13 @@ def test(self, *args, reps=1000, workers=1, auto=True, random_state=None):
             u, v = k_sample_transform(inputs, test_type="rf")
         else:
             u, v = k_sample_transform(inputs)
+        v = v.astype(int)
 
         kwargs = {}
         if self.indep_test_name in ["dcorr", "hsic"]:
             kwargs = {"auto": auto}
+        elif self.indep_test_name in ["kmerf"]:
+            kwargs = {"auto": auto}
 
         return self.indep_test.test(
             u, v, reps, workers, **kwargs, random_state=random_state

diff --git a/hyppo/tools/power.py b/hyppo/tools/power.py
@@ -24,6 +24,7 @@
     "disco": "fast",
     "manova": "analytical",
     "hotelling": "analytical",
+    "kmerf": "fast",
 }