Commit 25f4036
Add a low_memory option; clean up some numba warnings
lmcinnes committed Sep 9, 2019
1 parent 9d75307 commit 25f4036
Showing 4 changed files with 391 additions and 95 deletions.
19 changes: 11 additions & 8 deletions umap/distances.py
@@ -1032,15 +1032,15 @@ def count_distance(x, y, poisson_lambda=1.0, normalisation=1.0):
return result / normalisation


@numba.jit()
@numba.njit()
def levenshtein(x, y, normalisation=1.0, max_distance=20):
x_len, y_len = len(x), len(y)

# Opt out of some comparisons
if abs(x_len - y_len) > max_distance:
return abs(x_len - y_len) / normalisation

v0 = np.arange(y_len + 1)
v0 = np.arange(y_len + 1).astype(np.float64)
v1 = np.zeros(y_len + 1)

for i in range(x_len):
@@ -1150,22 +1150,25 @@ def levenshtein(x, y, normalisation=1.0, max_distance=20):
)


@numba.njit()
def pairwise_special_metric(X, Y=None, metric="hellinger"):
special_metric_func = named_distances[metric]

@numba.njit(parallel=True)
def parallel_special_metric(X, Y=None, metric=hellinger):
if Y is None:
result = np.zeros((X.shape[0], X.shape[0]))

for i in range(X.shape[0]):
for j in range(i + 1, X.shape[0]):
result[i, j] = special_metric_func(X[i], X[j])
result[i, j] = metric(X[i], X[j])
result[j, i] = result[i, j]
else:
result = np.zeros((X.shape[0], Y.shape[0]))

for i in range(X.shape[0]):
for j in range(Y.shape[0]):
result[i, j] = special_metric_func(X[i], Y[j])
result[i, j] = metric(X[i], Y[j])

return result


def pairwise_special_metric(X, Y=None, metric="hellinger"):
special_metric_func = named_distances[metric]
return parallel_special_metric(X, Y, metric=special_metric_func)
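
Note on the distances.py change above: the jit decorator moves off the string-dispatching wrapper, so the `named_distances[metric]` dict lookup now happens in plain Python (presumably one of the numba warnings the commit message targets), and only the O(n²) pairwise loop is compiled, now with `parallel=True`. A minimal usage sketch of the retained public entry point (the random data here is purely illustrative; hellinger expects non-negative feature vectors):

```python
import numpy as np
from umap.distances import pairwise_special_metric

X = np.random.random((100, 20))  # rows are non-negative, as hellinger expects
D = pairwise_special_metric(X, metric="hellinger")  # (100, 100), symmetric, zero diagonal
D_xy = pairwise_special_metric(X[:10], X[10:], metric="hellinger")  # (10, 90) cross-distances
```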
208 changes: 167 additions & 41 deletions umap/nndescent.py
@@ -16,59 +16,121 @@
new_build_candidates,
deheap_sort,
)
import umap.distances as dist

from umap.rp_tree import search_flat_tree


@numba.njit()
def nn_descent(
@numba.njit(fastmath=True)
def init_current_graph(data, dist, dist_args, n_neighbors, rng_state):
current_graph = make_heap(data.shape[0], n_neighbors)
for i in range(data.shape[0]):
indices = rejection_sample(n_neighbors, data.shape[0], rng_state)
for j in range(indices.shape[0]):
d = dist(data[i], data[indices[j]], *dist_args)
heap_push(current_graph, i, d, indices[j], 1)
heap_push(current_graph, indices[j], d, i, 1)
return current_graph


@numba.njit(fastmath=True)
def init_rp_tree(data, dist, dist_args, current_graph, leaf_array, tried=None):
if tried is None:
tried = set([(-1, -1)])

for n in range(leaf_array.shape[0]):
for i in range(leaf_array.shape[1]):
p = leaf_array[n, i]
if p < 0:
break
for j in range(i + 1, leaf_array.shape[1]):
q = leaf_array[n, j]
if q < 0:
break
if (p, q) in tried:
continue
d = dist(data[p], data[q], *dist_args)
heap_push(current_graph, p, d, q, 1)
tried.add((p, q))
if p != q:
heap_push(current_graph, q, d, p, 1)
tried.add((q, p))


@numba.njit(fastmath=True)
def nn_descent_internal_low_memory(
current_graph,
data,
n_neighbors,
rng_state,
dist,
dist_args=(),
max_candidates=50,
dist=dist.euclidean,
dist_args=(),

[Inline review comments]

tomwhite (Collaborator), Sep 11, 2019:
@lmcinnes it looks like switching the order of arguments here has broken tests. I think it went from

    nn_descent(..., dist, dist_args, max_candidates, ...)

to

    nn_descent(..., max_candidates, dist, dist_args, ...)

lmcinnes (Author, Owner), Sep 11, 2019:
Sorry about that - fixing it now.
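
For context, the breakage described above is the usual positional-argument hazard: the refactored `nn_descent` (its full new signature appears further down in this diff) moved `max_candidates` ahead of `dist` and `dist_args`, so positional callers written against the old order bind the metric function where `max_candidates` is expected, while keyword callers are unaffected. A self-contained sketch (not part of this commit; data, rng_state, and parameter values are placeholders, and the dummy `leaf_array` mirrors what umap_.py passes when no RP forest is used):

```python
import numpy as np
import umap.distances as dist
from umap.nndescent import nn_descent

data = np.random.random((200, 8)).astype(np.float32)
rng_state = np.random.randint(
    np.iinfo(np.int32).min, np.iinfo(np.int32).max, 3
).astype(np.int64)

# Old order:  nn_descent(data, n_neighbors, rng_state, dist, dist_args, max_candidates, ...)
# New order:  nn_descent(data, n_neighbors, rng_state, max_candidates, dist, dist_args, ...)
# Keyword arguments sidestep the reordering entirely:
knn_indices, knn_dists = nn_descent(
    data,
    15,
    rng_state,
    max_candidates=60,
    dist=dist.euclidean,
    dist_args=(),
    rp_tree_init=False,
    leaf_array=np.array([[-1]]),  # dummy leaf array, used when RP-tree seeding is skipped
)
```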

n_iters=10,
delta=0.001,
rho=0.5,
rp_tree_init=True,
leaf_array=None,
verbose=False,
):
n_vertices = data.shape[0]
tried = set([(-1, -1)])

current_graph = make_heap(data.shape[0], n_neighbors)
for i in range(data.shape[0]):
indices = rejection_sample(n_neighbors, data.shape[0], rng_state)
for j in range(indices.shape[0]):
d = dist(data[i], data[indices[j]], *dist_args)
heap_push(current_graph, i, d, indices[j], 1)
heap_push(current_graph, indices[j], d, i, 1)
tried.add((i, indices[j]))
tried.add((indices[j], i))
for n in range(n_iters):
if verbose:
print("\t", n, " / ", n_iters)

if rp_tree_init:
for n in range(leaf_array.shape[0]):
for i in range(leaf_array.shape[1]):
if leaf_array[n, i] < 0:
break
for j in range(i + 1, leaf_array.shape[1]):
if leaf_array[n, j] < 0:
break
if (leaf_array[n, i], leaf_array[n, j]) in tried:
(new_candidate_neighbors, old_candidate_neighbors) = new_build_candidates(
current_graph, n_vertices, n_neighbors, max_candidates, rng_state, rho
)

c = 0
for i in range(n_vertices):
for j in range(max_candidates):
p = int(new_candidate_neighbors[0, i, j])
if p < 0:
continue
for k in range(j, max_candidates):
q = int(new_candidate_neighbors[0, i, k])
if q < 0:
continue
d = dist(data[leaf_array[n, i]], data[leaf_array[n, j]], *dist_args)
unchecked_heap_push(
current_graph, leaf_array[n, i], d, leaf_array[n, j], 1
)
unchecked_heap_push(
current_graph, leaf_array[n, j], d, leaf_array[n, i], 1
)
tried.add((leaf_array[n, i], leaf_array[n, j]))
tried.add((leaf_array[n, j], leaf_array[n, i]))

d = dist(data[p], data[q], *dist_args)
c += heap_push(current_graph, p, d, q, 1)
if p != q:
c += heap_push(current_graph, q, d, p, 1)

for k in range(max_candidates):
q = int(old_candidate_neighbors[0, i, k])
if q < 0:
continue

d = dist(data[p], data[q], *dist_args)
c += heap_push(current_graph, p, d, q, 1)
if p != q:
c += heap_push(current_graph, q, d, p, 1)

if c <= delta * n_neighbors * data.shape[0]:
return


@numba.njit(fastmath=True)
def nn_descent_internal_high_memory(
current_graph,
data,
n_neighbors,
rng_state,
tried,
max_candidates=50,
dist=dist.euclidean,
dist_args=(),
n_iters=10,
delta=0.001,
rho=0.5,
verbose=False,
):
n_vertices = data.shape[0]

for n in range(n_iters):
if verbose:
print("\t", n, " / ", n_iters)

(new_candidate_neighbors, old_candidate_neighbors) = new_build_candidates(
current_graph, n_vertices, n_neighbors, max_candidates, rng_state, rho
@@ -87,9 +149,10 @@ def nn_descent(

d = dist(data[p], data[q], *dist_args)
c += unchecked_heap_push(current_graph, p, d, q, 1)
c += unchecked_heap_push(current_graph, q, d, p, 1)
tried.add((p, q))
tried.add((q, p))
if p != q:
c += unchecked_heap_push(current_graph, q, d, p, 1)
tried.add((q, p))

for k in range(max_candidates):
q = int(old_candidate_neighbors[0, i, k])
Expand All @@ -98,17 +161,80 @@ def nn_descent(

d = dist(data[p], data[q], *dist_args)
c += unchecked_heap_push(current_graph, p, d, q, 1)
c += unchecked_heap_push(current_graph, q, d, p, 1)
tried.add((p, q))
tried.add((q, p))
if p != q:
c += unchecked_heap_push(current_graph, q, d, p, 1)
tried.add((q, p))

if c <= delta * n_neighbors * data.shape[0]:
break
return


@numba.njit(fastmath=True)
def nn_descent(
data,
n_neighbors,
rng_state,
max_candidates=50,
dist=dist.euclidean,
dist_args=(),
n_iters=10,
delta=0.001,
rho=0.5,
rp_tree_init=True,
leaf_array=None,
low_memory=False,
verbose=False,
):
tried = set([(-1, -1)])

current_graph = make_heap(data.shape[0], n_neighbors)
for i in range(data.shape[0]):
indices = rejection_sample(n_neighbors, data.shape[0], rng_state)
for j in range(indices.shape[0]):
d = dist(data[i], data[indices[j]], *dist_args)
heap_push(current_graph, i, d, indices[j], 1)
heap_push(current_graph, indices[j], d, i, 1)
tried.add((i, indices[j]))
tried.add((indices[j], i))

if rp_tree_init:
init_rp_tree(data, dist, dist_args, current_graph, leaf_array, tried=tried)

if low_memory:
nn_descent_internal_low_memory(
current_graph,
data,
n_neighbors,
rng_state,
max_candidates=max_candidates,
dist=dist,
dist_args=dist_args,
n_iters=n_iters,
delta=delta,
rho=rho,
verbose=verbose,
)
else:
nn_descent_internal_high_memory(
current_graph,
data,
n_neighbors,
rng_state,
tried,
max_candidates=max_candidates,
dist=dist,
dist_args=dist_args,
n_iters=n_iters,
delta=delta,
rho=rho,
verbose=verbose,
)

return deheap_sort(current_graph)


@numba.njit(parallel=True)
@numba.njit()
def init_from_random(n_neighbors, data, query_points, heap, rng_state, dist, dist_args):
for i in range(query_points.shape[0]):
indices = rejection_sample(n_neighbors, data.shape[0], rng_state)
@@ -120,7 +246,7 @@ def init_from_random(n_neighbors, data, query_points, heap, rng_state, dist, dist_args):
return


@numba.njit(parallel=True)
@numba.njit()
def init_from_tree(tree, data, query_points, heap, rng_state, dist, dist_args):
for i in range(query_points.shape[0]):
indices = search_flat_tree(
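
Taken together, the refactored nn_descent seeds the k-NN heap with rejection-sampled random neighbors, optionally adds RP-tree leaf candidates via init_rp_tree, then refines with either nn_descent_internal_low_memory (which keeps no `tried` set of already-evaluated pairs) or nn_descent_internal_high_memory, and returns deheap_sort(current_graph) as before. A hedged sketch of the user-facing side, assuming the two changed files not shown on this page wire the flag through to the estimator (low_memory is a public UMAP() parameter in released umap-learn versions):

```python
# Hedged sketch of the user-facing flag; the estimator-side wiring lives in the
# changed files not shown on this page.
import numpy as np
import umap

X = np.random.random((10000, 50)).astype(np.float32)

# low_memory=True selects the low-memory NN-descent path above: it drops the
# `tried` bookkeeping, accepting some repeated distance evaluations in exchange
# for a smaller peak memory footprint during nearest-neighbor search.
embedding = umap.UMAP(n_neighbors=15, low_memory=True).fit_transform(X)
print(embedding.shape)  # (10000, 2)
```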
