Skip to content

Commit

Permalink
Add Fast kmerf (#369)
Browse files Browse the repository at this point in the history
* make kmerf fast

* use kernel on y as well

* update kmerf kernel

* remove weird import

* fix k-sample-transform rf

* make error trace better

* fix ksample rf k-sample-transform

* cast v to int for k-sample

* fix kmerf p-values

* update kmerf k-sample

* fix kmerf unit tests

* fix build errors

* remove is_ksamp after saving the values
  • Loading branch information
sampan501 committed May 24, 2023
1 parent a6ab700 commit f028f44
Show file tree
Hide file tree
Showing 6 changed files with 108 additions and 18 deletions.
4 changes: 3 additions & 1 deletion hyppo/independence/_utils.py
Expand Up @@ -50,7 +50,9 @@ def _check_nd_indeptest(self):
ny, _ = self.y.shape
if nx != ny:
raise ValueError(
"Shape mismatch, x and y must have shape " "[n, p] and [n, q]."
"Shape mismatch, x and y must have shape [n, p] and [n, q], found shape {} and {}".format(
self.x.shape, self.y.shape
)
)

def _check_min_samples(self):
Expand Down
95 changes: 85 additions & 10 deletions hyppo/independence/kmerf.py
@@ -1,6 +1,7 @@
from typing import NamedTuple

import numpy as np
from scipy.stats.distributions import chi2
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import pairwise_distances

Expand Down Expand Up @@ -36,6 +37,31 @@ class KMERF(IndependenceTest):
``test`` is categorial, use the "classifier" keyword.
ntrees : int, default: 500
The number of trees used in the random forest.
compute_distance : str, callable, or None, default: "euclidean"
A function that computes the distance among the samples for `y`.
Valid strings for ``compute_distance`` are, as defined in
:func:`sklearn.metrics.pairwise_distances`,
- From scikit-learn: [``"euclidean"``, ``"cityblock"``, ``"cosine"``,
``"l1"``, ``"l2"``, ``"manhattan"``] See the documentation for
:mod:`scipy.spatial.distance` for details
on these metrics.
- From scipy.spatial.distance: [``"braycurtis"``, ``"canberra"``,
``"chebyshev"``, ``"correlation"``, ``"dice"``, ``"hamming"``,
``"jaccard"``, ``"kulsinski"``, ``"mahalanobis"``, ``"minkowski"``,
``"rogerstanimoto"``, ``"russellrao"``, ``"seuclidean"``,
``"sokalmichener"``, ``"sokalsneath"``, ``"sqeuclidean"``,
``"yule"``] See the documentation for :mod:`scipy.spatial.distance` for
details on these metrics.
Set to ``None`` or ``"precomputed"`` if ``y`` is already a distance
matrices. To call a custom function, either create the distance matrix
before-hand or create a function of the form ``metric(x, **kwargs)``
where ``x`` is the data matrix for which pairwise distances are
calculated and ``**kwargs`` are extra arguements to send to your custom
function.
distance_kwargs : dict
Arbitrary keyword arguments for ``compute_distance``.
**kwargs
Additional arguments used for the forest (see
:class:`sklearn.ensemble.RandomForestClassifier` or
Expand Down Expand Up @@ -98,12 +124,28 @@ class KMERF(IndependenceTest):
.. footbibliography::
"""

def __init__(self, forest="regressor", ntrees=500, **kwargs):
def __init__(
self,
forest="regressor",
ntrees=500,
compute_distance="euclidean",
distance_kwargs={},
**kwargs
):
self.is_distance = False
self.distance_kwargs = distance_kwargs
if not compute_distance:
self.is_distance = True
self.is_ksamp = False
if "is_ksamp" in kwargs.keys():
self.is_ksamp = True
self.k_sample_transform = kwargs["is_ksamp"]
del kwargs["is_ksamp"]
if forest in FOREST_TYPES.keys():
self.clf = FOREST_TYPES[forest](n_estimators=ntrees, **kwargs)
else:
raise ValueError("Forest must be of type classification or regression")
IndependenceTest.__init__(self)
IndependenceTest.__init__(self, compute_distance=compute_distance)

def statistic(self, x, y):
r"""
Expand All @@ -122,11 +164,20 @@ def statistic(self, x, y):
stat : float
The computed KMERF statistic.
"""
y = y.reshape(-1)
self.clf.fit(x, y)
rf_y = y
if rf_y.shape[1] == 1:
rf_y = rf_y.ravel()
self.clf.fit(x, rf_y)

distx = np.sqrt(1 - sim_matrix(self.clf, x))
y = y.reshape(-1, 1)
disty = pairwise_distances(y, metric="euclidean")
# can use induced kernel if shapes are the same, otherwise
# default to compute_distance
if x.shape[1] == y.shape[1]:
disty = np.sqrt(1 - sim_matrix(self.clf, y))
else:
disty = pairwise_distances(
y, metric=self.compute_distance, **self.distance_kwargs
)
stat = _dcorr(distx, disty, bias=False, is_fast=False)
self.stat = stat

Expand All @@ -135,7 +186,7 @@ def statistic(self, x, y):

return stat

def test(self, x, y, reps=1000, workers=1, random_state=None):
def test(self, x, y, reps=1000, workers=1, auto=True, random_state=None):
r"""
Calculates the KMERF test statistic and p-value.
Expand All @@ -152,6 +203,13 @@ def test(self, x, y, reps=1000, workers=1, random_state=None):
workers : int, default: 1
The number of cores to parallelize the p-value computation over.
Supply ``-1`` to use all cores available to the Process.
auto : bool, default: True
Automatically uses fast approximation when `n` and size of array
is greater than 20. If ``True``, and sample size is greater than 20, then
:class:`hyppo.tools.chi2_approx` will be run. Parameters ``reps`` and
``workers`` are
irrelevant in this case. Otherwise, :class:`hyppo.tools.perm_test` will be
run.
Returns
-------
Expand All @@ -177,9 +235,26 @@ def test(self, x, y, reps=1000, workers=1, random_state=None):
check_input = _CheckInputs(x, y, reps=reps)
x, y = check_input()

stat, pvalue = super(KMERF, self).test(
x, y, reps, workers, is_distsim=False, random_state=random_state
)
if auto and x.shape[0] > 20:
n = x.shape[0]
stat = self.statistic(x, y)
if self.is_ksamp:
xs = self.k_sample_transform([x, x], test_type="rf")
ys = self.k_sample_transform([y, y], test_type="rf")
else:
xs = (x, x)
ys = (y, y)
statx = self.statistic(*xs)
staty = self.statistic(*ys)
pvalue = chi2.sf(stat / np.sqrt(statx * staty) * n + 1, 1)
# pvalue = chi2.sf(stat * n + 1, 1)
self.stat = stat
self.pvalue = pvalue
self.null_dist = None
else:
stat, pvalue = super(KMERF, self).test(
x, y, reps, workers, is_distsim=False, random_state=random_state
)
kmerf_dict = {"feat_importance": self.importances}

return KMERFTestOutput(stat, pvalue, kmerf_dict)
14 changes: 11 additions & 3 deletions hyppo/independence/tests/test_kmerf.py
Expand Up @@ -13,9 +13,17 @@ class TestKMERFStat(object):
@pytest.mark.parametrize(
"sim, obs_stat, obs_pvalue",
[
(linear, 0.253, 1.0), # test linear simulation
(spiral, 0.037, 1.0), # test spiral simulation
(multimodal_independence, -0.0363, 1.0), # test independence simulation
(linear, 1.0, 9.19834440770024e-24), # test linear simulation
(
spiral,
0.1835550720881665,
1.3087565060526607e-05,
), # test spiral simulation
(
multimodal_independence,
-0.0059788,
0.5390103019274002,
), # test independence simulation
],
)
def test_oned(self, sim, obs_stat, obs_pvalue):
Expand Down
7 changes: 4 additions & 3 deletions hyppo/ksample/_utils.py
Expand Up @@ -49,7 +49,8 @@ def check_dim(self):
def _check_nd_ksampletest(self, dims):
if len(set(dims)) > 1:
raise ValueError(
"Shape mismatch, inputs must have shape " "[n, p] and [m, p]."
"Shape mismatch, inputs must have shape "
"[n, p] and [m, p]. Found shapes {}".format(list(set(dims)))
)

def _convert_inputs_float64(self):
Expand Down Expand Up @@ -103,8 +104,8 @@ def k_sample_transform(inputs, test_type="normal"):
raise ValueError("Test cannot be run, the inputs have 0 variance")

if test_type == "rf":
v = np.concatenate(
[np.repeat(i, inputs[i].shape[0]) for i in range(n_inputs)], axis=0
v = np.vstack(
[np.repeat(i, inputs[i].shape[0]).reshape(-1, 1) for i in range(n_inputs)]
)
elif test_type == "normal":
if n_inputs == 2:
Expand Down
5 changes: 4 additions & 1 deletion hyppo/ksample/ksamp.py
Expand Up @@ -184,7 +184,7 @@ def __init__(self, indep_test, compute_distkern="euclidean", bias=False, **kwarg
"hsic": {"bias": bias, "compute_kernel": compute_distkern},
"hhg": {"compute_distance": compute_distkern},
"mgc": {"compute_distance": compute_distkern},
"kmerf": {"forest": "classifier"},
"kmerf": {"forest": "classifier", "is_ksamp": k_sample_transform},
"rv": {},
"cca": {},
}
Expand Down Expand Up @@ -293,10 +293,13 @@ def test(self, *args, reps=1000, workers=1, auto=True, random_state=None):
u, v = k_sample_transform(inputs, test_type="rf")
else:
u, v = k_sample_transform(inputs)
v = v.astype(int)

kwargs = {}
if self.indep_test_name in ["dcorr", "hsic"]:
kwargs = {"auto": auto}
elif self.indep_test_name in ["kmerf"]:
kwargs = {"auto": auto}

return self.indep_test.test(
u, v, reps, workers, **kwargs, random_state=random_state
Expand Down
1 change: 1 addition & 0 deletions hyppo/tools/power.py
Expand Up @@ -24,6 +24,7 @@
"disco": "fast",
"manova": "analytical",
"hotelling": "analytical",
"kmerf": "fast",
}


Expand Down

0 comments on commit f028f44

Please sign in to comment.