Skip to content

Commit

Permalink
Merge pull request #86 from sleighsoft/spearmanr
Browse files Browse the repository at this point in the history
Added spearmanr
  • Loading branch information
lmcinnes committed Nov 25, 2019
2 parents 4f36af4 + 1dce2f7 commit 4bd07f6
Show file tree
Hide file tree
Showing 4 changed files with 207 additions and 1 deletion.
1 change: 1 addition & 0 deletions README.rst
Expand Up @@ -81,6 +81,7 @@ distance metrics by default:

- cosine
- correlation
- spearmanr

**Metrics for binary data**

Expand Down
51 changes: 51 additions & 0 deletions pynndescent/distances.py
Expand Up @@ -365,6 +365,56 @@ def hellinger(x, y):
else:
return np.sqrt(1 - result / np.sqrt(l1_norm_x * l1_norm_y))


@numba.njit()
def rankdata(a, method="average"):
    """Assign 1-based ranks to data, resolving ties per ``method``.

    Numba-compiled port of ``scipy.stats.rankdata``. The input is
    flattened before ranking. ``method`` is one of "average" (default),
    "min", "max", "dense" or "ordinal". Returns a float64 array of ranks.
    """
    flat = np.ravel(np.asarray(a))
    # "ordinal" requires a stable sort so equal values keep input order.
    if method == "ordinal":
        order = flat.argsort(kind="mergesort")
    else:
        order = flat.argsort(kind="quicksort")

    # Inverse permutation: position of each input element in sorted order.
    inverse = np.empty(order.size, dtype=np.intp)
    inverse[order] = np.arange(order.size)

    if method == "ordinal":
        return (inverse + 1).astype(np.float64)

    sorted_vals = flat[order]
    # Flag the first occurrence of each distinct value in sorted order.
    first_obs = np.ones(sorted_vals.size, np.bool_)
    first_obs[1:] = sorted_vals[1:] != sorted_vals[:-1]
    dense = first_obs.cumsum()[inverse]

    if method == "dense":
        return dense.astype(np.float64)

    # Cumulative counts of each unique value: counts[d] is the number of
    # elements with dense rank <= d, i.e. the max rank of tie-group d.
    unique_starts = np.nonzero(first_obs)[0]
    counts = np.concatenate(
        (unique_starts, np.array([len(first_obs)], unique_starts.dtype))
    )

    if method == "max":
        return counts[dense].astype(np.float64)

    if method == "min":
        return (counts[dense - 1] + 1).astype(np.float64)

    # "average": midpoint of the min and max ranks of each tie group.
    return 0.5 * (counts[dense] + counts[dense - 1] + 1)


@numba.njit(fastmath=True)
def spearmanr(x, y):
    """Spearman rank correlation coefficient between 1d arrays x and y.

    Ranks each input with average tie-handling and returns the Pearson
    correlation of the ranks, matching ``scipy.stats.spearmanr``.
    """
    # Stack as float64 columns: ranks are fractional when there are ties,
    # so writing them back into an integer-typed array would silently
    # truncate (e.g. an average rank of 1.5 would become 1).
    a = np.column_stack((x, y)).astype(np.float64)

    n_vars = a.shape[1]

    for i in range(n_vars):
        a[:, i] = rankdata(a[:, i])
    # Correlation matrix of the two rank columns; rowvar=0 treats each
    # column as a variable.
    rs = np.corrcoef(a, rowvar=0)

    return rs[1, 0]



named_distances = {
# general minkowski distances
Expand All @@ -391,6 +441,7 @@ def hellinger(x, y):
"hellinger": hellinger,
"haversine": haversine,
"braycurtis": bray_curtis,
"spearmanr": spearmanr,
# Binary distances
"hamming": hamming,
"jaccard": jaccard,
Expand Down
12 changes: 11 additions & 1 deletion pynndescent/tests/test_distances.py
@@ -1,7 +1,8 @@
import numpy as np
from numpy.testing import assert_array_equal
import pynndescent.distances as dist
import pynndescent.sparse as spdist
from scipy import sparse
from scipy import sparse, stats
from sklearn.metrics import pairwise_distances
from sklearn.neighbors import BallTree
from sklearn.utils.testing import assert_array_almost_equal
Expand Down Expand Up @@ -385,3 +386,12 @@ def test_haversine():
dist_matrix,
err_msg="Distances don't match " "for metric haversine",
)


def test_spearmanr():
    """dist.spearmanr should agree with scipy.stats.spearmanr."""
    # Seed so any failure is reproducible.
    np.random.seed(42)
    x = np.random.randn(100)
    y = np.random.randn(100)

    scipy_expected = stats.spearmanr(x, y)
    r = dist.spearmanr(x, y)
    # Almost-equal, not bit-exact: the two implementations use different
    # floating-point pipelines (dist.spearmanr is compiled with fastmath),
    # so exact equality is fragile.
    assert_array_almost_equal(r, scipy_expected.correlation)
144 changes: 144 additions & 0 deletions pynndescent/tests/test_rank.py
@@ -0,0 +1,144 @@
import numpy as np
from numpy.testing import assert_equal, assert_array_equal

from pynndescent.distances import rankdata


def test_empty():
    """rankdata([]) should return an empty float64 array."""
    result = rankdata(np.array([], dtype=int))
    assert_array_equal(result, np.array([], dtype=np.float64))


def test_one():
    """A length-1 array gets the single rank 1.0."""
    result = rankdata(np.array([100], dtype=int))
    assert_array_equal(result, np.array([1.0], dtype=np.float64))


def test_basic():
    """Basic tests of rankdata with the default (average) method."""
    cases = [
        ([100, 10, 50], [3.0, 1.0, 2.0]),
        ([40, 10, 30, 10, 50], [4.0, 1.5, 3.0, 1.5, 5.0]),
        ([20, 20, 20, 10, 10, 10], [5.0, 5.0, 5.0, 2.0, 2.0, 2.0]),
    ]
    for data, expected in cases:
        result = rankdata(np.array(data, dtype=int))
        assert_array_equal(result, np.array(expected, dtype=np.float64))

    # The docstring states explicitly that the argument is flattened.
    a2d = np.array([20, 20, 20, 10, 10, 10], dtype=int).reshape(2, 3)
    expected = np.array([5.0, 5.0, 5.0, 2.0, 2.0, 2.0], dtype=np.float64)
    assert_array_equal(rankdata(a2d), expected)


def test_rankdata_object_string():
    """Compare each tie-breaking method against naive reference rankers."""
    min_rank = lambda a: [1 + sum(i < j for i in a) for j in a]
    max_rank = lambda a: [sum(i <= j for i in a) for j in a]
    ordinal_rank = lambda a: min_rank([(x, i) for i, x in enumerate(a)])

    def average_rank(a):
        # Midpoint of the min and max ranks for each element.
        return np.array(
            [(lo + hi) / 2.0 for lo, hi in zip(min_rank(a), max_rank(a))]
        )

    def dense_rank(a):
        uniq = np.unique(a)
        return np.array([1 + sum(u < j for u in uniq) for j in a])

    reference = dict(
        min=min_rank,
        max=max_rank,
        ordinal=ordinal_rank,
        average=average_rank,
        dense=dense_rank,
    )

    data = np.random.uniform(size=[200])
    for method in "min", "max", "dense", "ordinal", "average":
        assert_array_equal(rankdata(data, method=method), reference[method](data))


def test_large_int():
    """Ranking must not lose precision on ints beyond float64's 53 bits."""
    for dtype in (np.uint64, np.int64):
        data = np.array([2 ** 60, 2 ** 60 + 1], dtype=dtype)
        assert_array_equal(rankdata(data), [1.0, 2.0])

    data = np.array([2 ** 60, -(2 ** 60) + 1], dtype=np.int64)
    assert_array_equal(rankdata(data), [2.0, 1.0])


def test_big_tie():
    """An all-tied array of length n gets the average rank (n + 1) / 2."""
    for n in (10000, 100000, 1000000):
        data = np.ones(n, dtype=int)
        ranks = rankdata(data)
        assert_array_equal(ranks, 0.5 * (n + 1) * data, "test failed with n=%d" % n)


# fmt: off
# Parametrized table of (values, method, expected) triples covering the
# empty, singleton, all-tied, all-distinct, and mixed-tie inputs for every
# tie-breaking method; consumed by test_cases below.
_cases = (
    # values, method, expected
    (np.array([], np.float64), 'average', np.array([], np.float64)),
    (np.array([], np.float64), 'min', np.array([], np.float64)),
    (np.array([], np.float64), 'max', np.array([], np.float64)),
    (np.array([], np.float64), 'dense', np.array([], np.float64)),
    (np.array([], np.float64), 'ordinal', np.array([], np.float64)),
    #
    (np.array([100], np.float64), 'average', np.array([1.0], np.float64)),
    (np.array([100], np.float64), 'min', np.array([1.0], np.float64)),
    (np.array([100], np.float64), 'max', np.array([1.0], np.float64)),
    (np.array([100], np.float64), 'dense', np.array([1.0], np.float64)),
    (np.array([100], np.float64), 'ordinal', np.array([1.0], np.float64)),
    #
    (np.array([100, 100, 100], np.float64), 'average', np.array([2.0, 2.0, 2.0], np.float64)),
    (np.array([100, 100, 100], np.float64), 'min', np.array([1.0, 1.0, 1.0], np.float64)),
    (np.array([100, 100, 100], np.float64), 'max', np.array([3.0, 3.0, 3.0], np.float64)),
    (np.array([100, 100, 100], np.float64), 'dense', np.array([1.0, 1.0, 1.0], np.float64)),
    (np.array([100, 100, 100], np.float64), 'ordinal', np.array([1.0, 2.0, 3.0], np.float64)),
    #
    (np.array([100, 300, 200], np.float64), 'average', np.array([1.0, 3.0, 2.0], np.float64)),
    (np.array([100, 300, 200], np.float64), 'min', np.array([1.0, 3.0, 2.0], np.float64)),
    (np.array([100, 300, 200], np.float64), 'max', np.array([1.0, 3.0, 2.0], np.float64)),
    (np.array([100, 300, 200], np.float64), 'dense', np.array([1.0, 3.0, 2.0], np.float64)),
    (np.array([100, 300, 200], np.float64), 'ordinal', np.array([1.0, 3.0, 2.0], np.float64)),
    #
    (np.array([100, 200, 300, 200], np.float64), 'average', np.array([1.0, 2.5, 4.0, 2.5], np.float64)),
    (np.array([100, 200, 300, 200], np.float64), 'min', np.array([1.0, 2.0, 4.0, 2.0], np.float64)),
    (np.array([100, 200, 300, 200], np.float64), 'max', np.array([1.0, 3.0, 4.0, 3.0], np.float64)),
    (np.array([100, 200, 300, 200], np.float64), 'dense', np.array([1.0, 2.0, 3.0, 2.0], np.float64)),
    (np.array([100, 200, 300, 200], np.float64), 'ordinal', np.array([1.0, 2.0, 4.0, 3.0], np.float64)),
    #
    (np.array([100, 200, 300, 200, 100], np.float64), 'average', np.array([1.5, 3.5, 5.0, 3.5, 1.5], np.float64)),
    (np.array([100, 200, 300, 200, 100], np.float64), 'min', np.array([1.0, 3.0, 5.0, 3.0, 1.0], np.float64)),
    (np.array([100, 200, 300, 200, 100], np.float64), 'max', np.array([2.0, 4.0, 5.0, 4.0, 2.0], np.float64)),
    (np.array([100, 200, 300, 200, 100], np.float64), 'dense', np.array([1.0, 2.0, 3.0, 2.0, 1.0], np.float64)),
    (np.array([100, 200, 300, 200, 100], np.float64), 'ordinal', np.array([1.0, 3.0, 5.0, 4.0, 2.0], np.float64)),
    #
    (np.array([10] * 30, np.float64), 'ordinal', np.arange(1.0, 31.0, dtype=np.float64)),
)
# fmt: on


def test_cases():
    """Run rankdata over the parametrized (values, method, expected) table."""
    for values, method, expected in _cases:
        assert_array_equal(rankdata(values, method=method), expected)

0 comments on commit 4bd07f6

Please sign in to comment.