Add Fast kmerf #369

Merged
merged 13 commits on May 24, 2023
4 changes: 3 additions & 1 deletion hyppo/independence/_utils.py
@@ -50,7 +50,9 @@ def _check_nd_indeptest(self):
ny, _ = self.y.shape
if nx != ny:
raise ValueError(
"Shape mismatch, x and y must have shape " "[n, p] and [n, q]."
"Shape mismatch, x and y must have shape [n, p] and [n, q], found shape {} and {}".format(
self.x.shape, self.y.shape
)
)

def _check_min_samples(self):
95 changes: 85 additions & 10 deletions hyppo/independence/kmerf.py
@@ -1,6 +1,7 @@
from typing import NamedTuple

import numpy as np
from scipy.stats.distributions import chi2
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import pairwise_distances

@@ -36,6 +37,31 @@ class KMERF(IndependenceTest):
``test`` is categorical, use the "classifier" keyword.
ntrees : int, default: 500
The number of trees used in the random forest.
compute_distance : str, callable, or None, default: "euclidean"
A function that computes the distance among the samples for `y`.
Valid strings for ``compute_distance`` are, as defined in
:func:`sklearn.metrics.pairwise_distances`,

- From scikit-learn: [``"euclidean"``, ``"cityblock"``, ``"cosine"``,
``"l1"``, ``"l2"``, ``"manhattan"``]. See the documentation for
:func:`sklearn.metrics.pairwise_distances` for details on these metrics.
- From scipy.spatial.distance: [``"braycurtis"``, ``"canberra"``,
``"chebyshev"``, ``"correlation"``, ``"dice"``, ``"hamming"``,
``"jaccard"``, ``"kulsinski"``, ``"mahalanobis"``, ``"minkowski"``,
``"rogerstanimoto"``, ``"russellrao"``, ``"seuclidean"``,
``"sokalmichener"``, ``"sokalsneath"``, ``"sqeuclidean"``,
``"yule"``] See the documentation for :mod:`scipy.spatial.distance` for
details on these metrics.

Set to ``None`` or ``"precomputed"`` if ``y`` is already a distance
matrix. To call a custom function, either create the distance matrix
beforehand or create a function of the form ``metric(x, **kwargs)``
where ``x`` is the data matrix for which pairwise distances are
calculated and ``**kwargs`` are extra arguments to send to your custom
function.
distance_kwargs : dict
Arbitrary keyword arguments for ``compute_distance``.
**kwargs
Additional arguments used for the forest (see
:class:`sklearn.ensemble.RandomForestClassifier` or
@@ -98,12 +124,28 @@ class KMERF(IndependenceTest):
.. footbibliography::
"""

def __init__(self, forest="regressor", ntrees=500, **kwargs):
def __init__(
self,
forest="regressor",
ntrees=500,
compute_distance="euclidean",
distance_kwargs={},
**kwargs
):
self.is_distance = False
self.distance_kwargs = distance_kwargs
if not compute_distance:
self.is_distance = True
self.is_ksamp = False
if "is_ksamp" in kwargs.keys():
self.is_ksamp = True
self.k_sample_transform = kwargs["is_ksamp"]
del kwargs["is_ksamp"]
if forest in FOREST_TYPES.keys():
self.clf = FOREST_TYPES[forest](n_estimators=ntrees, **kwargs)
else:
raise ValueError("Forest must be of type classification or regression")
IndependenceTest.__init__(self)
IndependenceTest.__init__(self, compute_distance=compute_distance)
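For reference, a minimal sketch of how the new constructor arguments could be used; the data and variable names below are hypothetical and not part of this PR:

import numpy as np
from hyppo.independence import KMERF

# hypothetical data: 100 samples, 3 features in x, one response column in y
rng = np.random.default_rng(0)
x = rng.normal(size=(100, 3))
y = (x[:, 0] + rng.normal(scale=0.1, size=100)).reshape(-1, 1)

# compute_distance and distance_kwargs are the keywords added in this PR
test = KMERF(forest="regressor", ntrees=500, compute_distance="euclidean", distance_kwargs={})
stat, pvalue, kmerf_dict = test.test(x, y)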

def statistic(self, x, y):
r"""
@@ -122,11 +164,20 @@ def statistic(self, x, y):
stat : float
The computed KMERF statistic.
"""
y = y.reshape(-1)
self.clf.fit(x, y)
rf_y = y
if rf_y.shape[1] == 1:
rf_y = rf_y.ravel()
self.clf.fit(x, rf_y)

distx = np.sqrt(1 - sim_matrix(self.clf, x))
y = y.reshape(-1, 1)
disty = pairwise_distances(y, metric="euclidean")
# can use induced kernel if shapes are the same, otherwise
# default to compute_distance
if x.shape[1] == y.shape[1]:
disty = np.sqrt(1 - sim_matrix(self.clf, y))
else:
disty = pairwise_distances(
y, metric=self.compute_distance, **self.distance_kwargs
)
stat = _dcorr(distx, disty, bias=False, is_fast=False)
self.stat = stat

@@ -135,7 +186,7 @@ def statistic(self, x, y):

return stat
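The ``x`` distance above is induced by the forest's proximity matrix (the fraction of trees in which two samples share a leaf), and ``y`` gets the same treatment when its shape matches ``x``. A minimal numpy/scikit-learn sketch of that conversion; ``proximity_matrix`` below is an illustrative stand-in for hyppo's ``sim_matrix``, not the library helper itself:

import numpy as np
from sklearn.ensemble import RandomForestRegressor

def proximity_matrix(forest, x):
    # leaf index of every sample in every tree, shape (n_samples, n_trees)
    leaves = forest.apply(x)
    # proximity = fraction of trees in which two samples land in the same leaf
    prox = np.zeros((x.shape[0], x.shape[0]))
    for t in range(leaves.shape[1]):
        prox += np.equal.outer(leaves[:, t], leaves[:, t])
    return prox / leaves.shape[1]

# hypothetical usage
rng = np.random.default_rng(0)
x = rng.normal(size=(50, 3))
y = x[:, 0] + rng.normal(scale=0.1, size=50)
forest = RandomForestRegressor(n_estimators=100).fit(x, y)
distx = np.sqrt(1 - proximity_matrix(forest, x))  # kernel-induced distance used for dcorr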

def test(self, x, y, reps=1000, workers=1, random_state=None):
def test(self, x, y, reps=1000, workers=1, auto=True, random_state=None):
r"""
Calculates the KMERF test statistic and p-value.

@@ -152,6 +203,13 @@ def test(self, x, y, reps=1000, workers=1, random_state=None):
workers : int, default: 1
The number of cores to parallelize the p-value computation over.
Supply ``-1`` to use all cores available to the Process.
auto : bool, default: True
If ``True``, a fast chi-squared approximation to the null distribution is
used whenever the sample size `n` is greater than 20; the ``reps`` and
``workers`` parameters are ignored in that case. Otherwise,
:func:`hyppo.tools.perm_test` will be run.

Returns
-------
@@ -177,9 +235,26 @@ def test(self, x, y, reps=1000, workers=1, random_state=None):
check_input = _CheckInputs(x, y, reps=reps)
x, y = check_input()

stat, pvalue = super(KMERF, self).test(
x, y, reps, workers, is_distsim=False, random_state=random_state
)
if auto and x.shape[0] > 20:
n = x.shape[0]
stat = self.statistic(x, y)
if self.is_ksamp:
xs = self.k_sample_transform([x, x], test_type="rf")
ys = self.k_sample_transform([y, y], test_type="rf")
else:
xs = (x, x)
ys = (y, y)
statx = self.statistic(*xs)
staty = self.statistic(*ys)
pvalue = chi2.sf(stat / np.sqrt(statx * staty) * n + 1, 1)
# pvalue = chi2.sf(stat * n + 1, 1)
self.stat = stat
self.pvalue = pvalue
self.null_dist = None
else:
stat, pvalue = super(KMERF, self).test(
x, y, reps, workers, is_distsim=False, random_state=random_state
)
kmerf_dict = {"feat_importance": self.importances}

return KMERFTestOutput(stat, pvalue, kmerf_dict)
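As a worked example of the fast path above: the statistic is normalized by the self-statistics of ``x`` and ``y``, scaled by the sample size, and compared against a chi-squared distribution with one degree of freedom. A minimal sketch of just that p-value step, with hypothetical statistic values:

import numpy as np
from scipy.stats.distributions import chi2

n = 100                   # sample size (hypothetical)
stat = 0.25               # KMERF statistic for (x, y) (hypothetical)
statx, staty = 0.9, 0.8   # self-statistics for (x, x) and (y, y) (hypothetical)

normalized = stat / np.sqrt(statx * staty)
pvalue = chi2.sf(normalized * n + 1, 1)  # survival function of a chi2 with 1 degree of freedom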
14 changes: 11 additions & 3 deletions hyppo/independence/tests/test_kmerf.py
@@ -13,9 +13,17 @@ class TestKMERFStat(object):
@pytest.mark.parametrize(
"sim, obs_stat, obs_pvalue",
[
(linear, 0.253, 1.0), # test linear simulation
(spiral, 0.037, 1.0), # test spiral simulation
(multimodal_independence, -0.0363, 1.0), # test independence simulation
(linear, 1.0, 9.19834440770024e-24), # test linear simulation
(
spiral,
0.1835550720881665,
1.3087565060526607e-05,
), # test spiral simulation
(
multimodal_independence,
-0.0059788,
0.5390103019274002,
), # test independence simulation
],
)
def test_oned(self, sim, obs_stat, obs_pvalue):
7 changes: 4 additions & 3 deletions hyppo/ksample/_utils.py
@@ -49,7 +49,8 @@ def check_dim(self):
def _check_nd_ksampletest(self, dims):
if len(set(dims)) > 1:
raise ValueError(
"Shape mismatch, inputs must have shape " "[n, p] and [m, p]."
"Shape mismatch, inputs must have shape "
"[n, p] and [m, p]. Found shapes {}".format(list(set(dims)))
)

def _convert_inputs_float64(self):
@@ -103,8 +104,8 @@ def k_sample_transform(inputs, test_type="normal"):
raise ValueError("Test cannot be run, the inputs have 0 variance")

if test_type == "rf":
v = np.concatenate(
[np.repeat(i, inputs[i].shape[0]) for i in range(n_inputs)], axis=0
v = np.vstack(
[np.repeat(i, inputs[i].shape[0]).reshape(-1, 1) for i in range(n_inputs)]
)
elif test_type == "normal":
if n_inputs == 2:
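With ``test_type="rf"``, the group labels are now stacked as a column vector, so the downstream forest receives an ``(n, 1)`` response instead of a flat array. A small numpy-only sketch of the label construction, mirroring the change above with hypothetical sample sizes:

import numpy as np

inputs = [np.zeros((3, 2)), np.ones((2, 2))]  # two hypothetical samples, 3 and 2 rows each
u = np.vstack(inputs)  # stacked data, shape (5, 2)
v = np.vstack(
    [np.repeat(i, inputs[i].shape[0]).reshape(-1, 1) for i in range(len(inputs))]
)  # group labels, shape (5, 1): [[0], [0], [0], [1], [1]]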
5 changes: 4 additions & 1 deletion hyppo/ksample/ksamp.py
@@ -184,7 +184,7 @@ def __init__(self, indep_test, compute_distkern="euclidean", bias=False, **kwarg
"hsic": {"bias": bias, "compute_kernel": compute_distkern},
"hhg": {"compute_distance": compute_distkern},
"mgc": {"compute_distance": compute_distkern},
"kmerf": {"forest": "classifier"},
"kmerf": {"forest": "classifier", "is_ksamp": k_sample_transform},
"rv": {},
"cca": {},
}
@@ -293,10 +293,13 @@ def test(self, *args, reps=1000, workers=1, auto=True, random_state=None):
u, v = k_sample_transform(inputs, test_type="rf")
else:
u, v = k_sample_transform(inputs)
v = v.astype(int)

kwargs = {}
if self.indep_test_name in ["dcorr", "hsic"]:
kwargs = {"auto": auto}
elif self.indep_test_name in ["kmerf"]:
kwargs = {"auto": auto}

return self.indep_test.test(
u, v, reps, workers, **kwargs, random_state=random_state
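Since the k-sample wrapper now forwards ``auto`` to KMERF, the fast path should also be reachable through ``KSample``. A minimal usage sketch under that assumption; the data and group shift below are hypothetical:

import numpy as np
from hyppo.ksample import KSample

rng = np.random.default_rng(0)
x = rng.normal(loc=0.0, size=(100, 2))
y = rng.normal(loc=0.5, size=(100, 2))

stat, pvalue, kmerf_dict = KSample("kmerf").test(x, y, auto=True)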
1 change: 1 addition & 0 deletions hyppo/tools/power.py
@@ -24,6 +24,7 @@
"disco": "fast",
"manova": "analytical",
"hotelling": "analytical",
"kmerf": "fast",
}

