Added spearmanr #86

Merged
merged 2 commits on Nov 25, 2019
1 change: 1 addition & 0 deletions README.rst
@@ -81,6 +81,7 @@ distance metrics by default:

- cosine
- correlation
- spearmanr

**Metrics for binary data**

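The metric name added to the README above can be passed straight to the index constructor. A minimal usage sketch, assuming the standard pynndescent NNDescent/query API; the data shapes and parameter values below are purely illustrative:

import numpy as np
import pynndescent

# Illustrative data: 200 points in 32 dimensions.
data = np.random.randn(200, 32).astype(np.float32)

# Build an approximate nearest-neighbour index with the newly registered metric.
index = pynndescent.NNDescent(data, metric="spearmanr", n_neighbors=15)

# Query the 5 nearest neighbours of a few new points.
queries = np.random.randn(3, 32).astype(np.float32)
neighbor_indices, neighbor_dists = index.query(queries, k=5)
print(neighbor_indices.shape, neighbor_dists.shape)  # (3, 5) and (3, 5)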
51 changes: 51 additions & 0 deletions pynndescent/distances.py
@@ -365,6 +365,56 @@ def hellinger(x, y):
    else:
        return np.sqrt(1 - result / np.sqrt(l1_norm_x * l1_norm_y))


@numba.njit()
def rankdata(a, method="average"):
    # Numba-compatible ranking of the flattened input, with ties resolved
    # according to ``method``; mirrors the behaviour of scipy.stats.rankdata.
    arr = np.ravel(np.asarray(a))
    if method == "ordinal":
        # ordinal ranks need a stable sort so equal values keep input order
        sorter = arr.argsort(kind="mergesort")
    else:
        sorter = arr.argsort(kind="quicksort")

    inv = np.empty(sorter.size, dtype=np.intp)
    inv[sorter] = np.arange(sorter.size)

    if method == "ordinal":
        return (inv + 1).astype(np.float64)

    arr = arr[sorter]
    obs = np.ones(arr.size, np.bool_)
    obs[1:] = arr[1:] != arr[:-1]
    dense = obs.cumsum()[inv]

    if method == "dense":
        return dense.astype(np.float64)

    # cumulative counts of each unique value
    nonzero = np.nonzero(obs)[0]
    count = np.concatenate((nonzero, np.array([len(obs)], nonzero.dtype)))

    if method == "max":
        return count[dense].astype(np.float64)

    if method == "min":
        return (count[dense - 1] + 1).astype(np.float64)

    # average method
    return 0.5 * (count[dense] + count[dense - 1] + 1)


@numba.njit(fastmath=True)
def spearmanr(x, y):
    # Spearman rank-order correlation: rank both inputs, then take the
    # Pearson correlation of the rank vectors.
    a = np.column_stack((x, y))

    n_vars = a.shape[1]

    for i in range(n_vars):
        a[:, i] = rankdata(a[:, i])
    rs = np.corrcoef(a, rowvar=0)

    return rs[1, 0]



named_distances = {
    # general minkowski distances
@@ -391,6 +441,7 @@ def hellinger(x, y):
    "hellinger": hellinger,
    "haversine": haversine,
    "braycurtis": bray_curtis,
    "spearmanr": spearmanr,
    # Binary distances
    "hamming": hamming,
    "jaccard": jaccard,
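For reference, the two helpers added here track their SciPy counterparts: rankdata assigns ranks with ties resolved by ``method`` (the default "average" gives tied values the mean of the positions they occupy), and spearmanr is the Pearson correlation of the two rank vectors. A small sketch of that equivalence on hypothetical toy data, with scipy used only for comparison:

import numpy as np
from scipy import stats
import pynndescent.distances as dist

x = np.array([1.0, 2.0, 2.0, 5.0])
y = np.array([3.0, 1.0, 4.0, 4.0])

# Tied values share the average of the ranks they would otherwise occupy.
print(dist.rankdata(x))      # [1.  2.5 2.5 4. ]
print(stats.rankdata(x))     # [1.  2.5 2.5 4. ]

# Spearman correlation is the Pearson correlation of the rank vectors;
# all three lines below print 0.5 for this toy data.
print(dist.spearmanr(x, y))
print(np.corrcoef(stats.rankdata(x), stats.rankdata(y))[1, 0])
print(stats.spearmanr(x, y).correlation)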
12 changes: 11 additions & 1 deletion pynndescent/tests/test_distances.py
@@ -1,7 +1,8 @@
import numpy as np
from numpy.testing import assert_array_equal
import pynndescent.distances as dist
import pynndescent.sparse as spdist
from scipy import sparse
from scipy import sparse, stats
from sklearn.metrics import pairwise_distances
from sklearn.neighbors import BallTree
from sklearn.utils.testing import assert_array_almost_equal
@@ -385,3 +386,12 @@ def test_haversine():
        dist_matrix,
        err_msg="Distances don't match " "for metric haversine",
    )


def test_spearmanr():
    x = np.random.randn(100)
    y = np.random.randn(100)

    scipy_expected = stats.spearmanr(x, y)
    r = dist.spearmanr(x, y)
    assert_array_equal(r, scipy_expected.correlation)
144 changes: 144 additions & 0 deletions pynndescent/tests/test_rank.py
@@ -0,0 +1,144 @@
import numpy as np
from numpy.testing import assert_equal, assert_array_equal

from pynndescent.distances import rankdata


def test_empty():
    """rankdata([]) should return an empty array."""
    a = np.array([], dtype=int)
    r = rankdata(a)
    assert_array_equal(r, np.array([], dtype=np.float64))


def test_one():
    """Check rankdata with an array of length 1."""
    data = [100]
    a = np.array(data, dtype=int)
    r = rankdata(a)
    assert_array_equal(r, np.array([1.0], dtype=np.float64))


def test_basic():
    """Basic tests of rankdata."""
    data = [100, 10, 50]
    expected = np.array([3.0, 1.0, 2.0], dtype=np.float64)
    a = np.array(data, dtype=int)
    r = rankdata(a)
    assert_array_equal(r, expected)

    data = [40, 10, 30, 10, 50]
    expected = np.array([4.0, 1.5, 3.0, 1.5, 5.0], dtype=np.float64)
    a = np.array(data, dtype=int)
    r = rankdata(a)
    assert_array_equal(r, expected)

    data = [20, 20, 20, 10, 10, 10]
    expected = np.array([5.0, 5.0, 5.0, 2.0, 2.0, 2.0], dtype=np.float64)
    a = np.array(data, dtype=int)
    r = rankdata(a)
    assert_array_equal(r, expected)
    # The docstring states explicitly that the argument is flattened.
    a2d = a.reshape(2, 3)
    r = rankdata(a2d)
    assert_array_equal(r, expected)


def test_rankdata_object_string():
    min_rank = lambda a: [1 + sum(i < j for i in a) for j in a]
    max_rank = lambda a: [sum(i <= j for i in a) for j in a]
    ordinal_rank = lambda a: min_rank([(x, i) for i, x in enumerate(a)])

    def average_rank(a):
        return np.array([(i + j) / 2.0 for i, j in zip(min_rank(a), max_rank(a))])

    def dense_rank(a):
        b = np.unique(a)
        return np.array([1 + sum(i < j for i in b) for j in a])

    rankf = dict(
        min=min_rank,
        max=max_rank,
        ordinal=ordinal_rank,
        average=average_rank,
        dense=dense_rank,
    )

    def check_ranks(a):
        for method in "min", "max", "dense", "ordinal", "average":
            out = rankdata(a, method=method)
            assert_array_equal(out, rankf[method](a))

    check_ranks(np.random.uniform(size=[200]))


def test_large_int():
    data = np.array([2 ** 60, 2 ** 60 + 1], dtype=np.uint64)
    r = rankdata(data)
    assert_array_equal(r, [1.0, 2.0])

    data = np.array([2 ** 60, 2 ** 60 + 1], dtype=np.int64)
    r = rankdata(data)
    assert_array_equal(r, [1.0, 2.0])

    data = np.array([2 ** 60, -2 ** 60 + 1], dtype=np.int64)
    r = rankdata(data)
    assert_array_equal(r, [2.0, 1.0])


def test_big_tie():
    for n in [10000, 100000, 1000000]:
        data = np.ones(n, dtype=int)
        r = rankdata(data)
        expected_rank = 0.5 * (n + 1)
        assert_array_equal(r, expected_rank * data, "test failed with n=%d" % n)


# fmt: off
_cases = (
    # values, method, expected
    (np.array([], np.float64), 'average', np.array([], np.float64)),
    (np.array([], np.float64), 'min', np.array([], np.float64)),
    (np.array([], np.float64), 'max', np.array([], np.float64)),
    (np.array([], np.float64), 'dense', np.array([], np.float64)),
    (np.array([], np.float64), 'ordinal', np.array([], np.float64)),
    #
    (np.array([100], np.float64), 'average', np.array([1.0], np.float64)),
    (np.array([100], np.float64), 'min', np.array([1.0], np.float64)),
    (np.array([100], np.float64), 'max', np.array([1.0], np.float64)),
    (np.array([100], np.float64), 'dense', np.array([1.0], np.float64)),
    (np.array([100], np.float64), 'ordinal', np.array([1.0], np.float64)),
    #
    (np.array([100, 100, 100], np.float64), 'average', np.array([2.0, 2.0, 2.0], np.float64)),
    (np.array([100, 100, 100], np.float64), 'min', np.array([1.0, 1.0, 1.0], np.float64)),
    (np.array([100, 100, 100], np.float64), 'max', np.array([3.0, 3.0, 3.0], np.float64)),
    (np.array([100, 100, 100], np.float64), 'dense', np.array([1.0, 1.0, 1.0], np.float64)),
    (np.array([100, 100, 100], np.float64), 'ordinal', np.array([1.0, 2.0, 3.0], np.float64)),
    #
    (np.array([100, 300, 200], np.float64), 'average', np.array([1.0, 3.0, 2.0], np.float64)),
    (np.array([100, 300, 200], np.float64), 'min', np.array([1.0, 3.0, 2.0], np.float64)),
    (np.array([100, 300, 200], np.float64), 'max', np.array([1.0, 3.0, 2.0], np.float64)),
    (np.array([100, 300, 200], np.float64), 'dense', np.array([1.0, 3.0, 2.0], np.float64)),
    (np.array([100, 300, 200], np.float64), 'ordinal', np.array([1.0, 3.0, 2.0], np.float64)),
    #
    (np.array([100, 200, 300, 200], np.float64), 'average', np.array([1.0, 2.5, 4.0, 2.5], np.float64)),
    (np.array([100, 200, 300, 200], np.float64), 'min', np.array([1.0, 2.0, 4.0, 2.0], np.float64)),
    (np.array([100, 200, 300, 200], np.float64), 'max', np.array([1.0, 3.0, 4.0, 3.0], np.float64)),
    (np.array([100, 200, 300, 200], np.float64), 'dense', np.array([1.0, 2.0, 3.0, 2.0], np.float64)),
    (np.array([100, 200, 300, 200], np.float64), 'ordinal', np.array([1.0, 2.0, 4.0, 3.0], np.float64)),
    #
    (np.array([100, 200, 300, 200, 100], np.float64), 'average', np.array([1.5, 3.5, 5.0, 3.5, 1.5], np.float64)),
    (np.array([100, 200, 300, 200, 100], np.float64), 'min', np.array([1.0, 3.0, 5.0, 3.0, 1.0], np.float64)),
    (np.array([100, 200, 300, 200, 100], np.float64), 'max', np.array([2.0, 4.0, 5.0, 4.0, 2.0], np.float64)),
    (np.array([100, 200, 300, 200, 100], np.float64), 'dense', np.array([1.0, 2.0, 3.0, 2.0, 1.0], np.float64)),
    (np.array([100, 200, 300, 200, 100], np.float64), 'ordinal', np.array([1.0, 3.0, 5.0, 4.0, 2.0], np.float64)),
    #
    (np.array([10] * 30, np.float64), 'ordinal', np.arange(1.0, 31.0, dtype=np.float64)),
)
# fmt: on


def test_cases():
    for values, method, expected in _cases:
        r = rankdata(values, method=method)
        assert_array_equal(r, expected)
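As a complement to the enumerated cases above, the tie-handling behaviour can also be cross-checked against scipy.stats.rankdata, which this suite mirrors. A sketch under that assumption (not part of the PR; scipy is only used here for comparison):

import numpy as np
from scipy import stats
from pynndescent.distances import rankdata

rng = np.random.RandomState(42)
values = rng.randint(0, 10, size=50).astype(np.float64)  # small value range, so plenty of ties

for method in ("average", "min", "max", "dense", "ordinal"):
    np.testing.assert_array_equal(
        rankdata(values, method=method),
        stats.rankdata(values, method=method),
    )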