Commit 25f4036
Add a low_memory option; clean up some numba warnings
lmcinnes committed Sep 9, 2019
1 parent 9d75307 commit 25f4036
Showing 4 changed files with 391 additions and 95 deletions.
19 changes: 11 additions & 8 deletions umap/distances.py
@@ -1032,15 +1032,15 @@ def count_distance(x, y, poisson_lambda=1.0, normalisation=1.0):
return result / normalisation


@numba.jit()
@numba.njit()
def levenshtein(x, y, normalisation=1.0, max_distance=20):
x_len, y_len = len(x), len(y)

# Opt out of some comparisons
if abs(x_len - y_len) > max_distance:
return abs(x_len - y_len) / normalisation

v0 = np.arange(y_len + 1)
v0 = np.arange(y_len + 1).astype(np.float64)
v1 = np.zeros(y_len + 1)

for i in range(x_len):
@@ -1150,22 +1150,25 @@ def levenshtein(x, y, normalisation=1.0, max_distance=20):
)


@numba.njit()
def pairwise_special_metric(X, Y=None, metric="hellinger"):
special_metric_func = named_distances[metric]

@numba.njit(parallel=True)
def parallel_special_metric(X, Y=None, metric=hellinger):
if Y is None:
result = np.zeros((X.shape[0], X.shape[0]))

for i in range(X.shape[0]):
for j in range(i + 1, X.shape[0]):
result[i, j] = special_metric_func(X[i], X[j])
result[i, j] = metric(X[i], X[j])
result[j, i] = result[i, j]
else:
result = np.zeros((X.shape[0], Y.shape[0]))

for i in range(X.shape[0]):
for j in range(Y.shape[0]):
result[i, j] = special_metric_func(X[i], Y[j])
result[i, j] = metric(X[i], Y[j])

return result


def pairwise_special_metric(X, Y=None, metric="hellinger"):
special_metric_func = named_distances[metric]
return parallel_special_metric(X, Y, metric=special_metric_func)
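
Note on the distances.py change above: the jit decorator moves off the string-dispatching wrapper, so the `named_distances[metric]` dict lookup now happens in plain Python (presumably one of the numba warnings the commit message targets), and only the O(n²) pairwise loop is compiled, now with `parallel=True`. A minimal usage sketch of the retained public entry point (the random data here is purely illustrative; hellinger expects non-negative feature vectors):

```python
import numpy as np
from umap.distances import pairwise_special_metric

X = np.random.random((100, 20))  # rows are non-negative, as hellinger expects
D = pairwise_special_metric(X, metric="hellinger")  # (100, 100), symmetric, zero diagonal
D_xy = pairwise_special_metric(X[:10], X[10:], metric="hellinger")  # (10, 90) cross-distances
```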
208 changes: 167 additions & 41 deletions umap/nndescent.py
@@ -16,59 +16,121 @@
new_build_candidates,
deheap_sort,
)
import umap.distances as dist

from umap.rp_tree import search_flat_tree


@numba.njit()
def nn_descent(
@numba.njit(fastmath=True)
def init_current_graph(data, dist, dist_args, n_neighbors, rng_state):
current_graph = make_heap(data.shape[0], n_neighbors)
for i in range(data.shape[0]):
indices = rejection_sample(n_neighbors, data.shape[0], rng_state)
for j in range(indices.shape[0]):
d = dist(data[i], data[indices[j]], *dist_args)
heap_push(current_graph, i, d, indices[j], 1)
heap_push(current_graph, indices[j], d, i, 1)
return current_graph


@numba.njit(fastmath=True)
def init_rp_tree(data, dist, dist_args, current_graph, leaf_array, tried=None):
if tried is None:
tried = set([(-1, -1)])

for n in range(leaf_array.shape[0]):
for i in range(leaf_array.shape[1]):
p = leaf_array[n, i]
if p < 0:
break
for j in range(i + 1, leaf_array.shape[1]):
q = leaf_array[n, j]
if q < 0:
break
if (p, q) in tried:
continue
d = dist(data[p], data[q], *dist_args)
heap_push(current_graph, p, d, q, 1)
tried.add((p, q))
if p != q:
heap_push(current_graph, q, d, p, 1)
tried.add((q, p))


@numba.njit(fastmath=True)
def nn_descent_internal_low_memory(
current_graph,
data,
n_neighbors,
rng_state,
dist,
dist_args=(),
max_candidates=50,
dist=dist.euclidean,
dist_args=(),

[Inline review comments]

tomwhite (Collaborator), Sep 11, 2019:
@lmcinnes it looks like switching the order of arguments here has broken tests. I think it went from

    nn_descent(..., dist, dist_args, max_candidates, ...)

to

    nn_descent(..., max_candidates, dist, dist_args, ...)

lmcinnes (Author, Owner), Sep 11, 2019:
Sorry about that - fixing it now.
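
For context, the breakage described above is the usual positional-argument hazard: the refactored `nn_descent` (its full new signature appears further down in this diff) moved `max_candidates` ahead of `dist` and `dist_args`, so positional callers written against the old order bind the metric function where `max_candidates` is expected, while keyword callers are unaffected. A self-contained sketch (not part of this commit; data, rng_state, and parameter values are placeholders, and the dummy `leaf_array` mirrors what umap_.py passes when no RP forest is used):

```python
import numpy as np
import umap.distances as dist
from umap.nndescent import nn_descent

data = np.random.random((200, 8)).astype(np.float32)
rng_state = np.random.randint(
    np.iinfo(np.int32).min, np.iinfo(np.int32).max, 3
).astype(np.int64)

# Old order:  nn_descent(data, n_neighbors, rng_state, dist, dist_args, max_candidates, ...)
# New order:  nn_descent(data, n_neighbors, rng_state, max_candidates, dist, dist_args, ...)
# Keyword arguments sidestep the reordering entirely:
knn_indices, knn_dists = nn_descent(
    data,
    15,
    rng_state,
    max_candidates=60,
    dist=dist.euclidean,
    dist_args=(),
    rp_tree_init=False,
    leaf_array=np.array([[-1]]),  # dummy leaf array, used when RP-tree seeding is skipped
)
```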

n_iters=10,
delta=0.001,
rho=0.5,
rp_tree_init=True,
leaf_array=None,
verbose=False,
):
n_vertices = data.shape[0]
tried = set([(-1, -1)])

current_graph = make_heap(data.shape[0], n_neighbors)
for i in range(data.shape[0]):
indices = rejection_sample(n_neighbors, data.shape[0], rng_state)
for j in range(indices.shape[0]):
d = dist(data[i], data[indices[j]], *dist_args)
heap_push(current_graph, i, d, indices[j], 1)
heap_push(current_graph, indices[j], d, i, 1)
tried.add((i, indices[j]))
tried.add((indices[j], i))
for n in range(n_iters):
if verbose:
print("\t", n, " / ", n_iters)

if rp_tree_init:
for n in range(leaf_array.shape[0]):
for i in range(leaf_array.shape[1]):
if leaf_array[n, i] < 0:
break
for j in range(i + 1, leaf_array.shape[1]):
if leaf_array[n, j] < 0:
break
if (leaf_array[n, i], leaf_array[n, j]) in tried:
(new_candidate_neighbors, old_candidate_neighbors) = new_build_candidates(
current_graph, n_vertices, n_neighbors, max_candidates, rng_state, rho
)

c = 0
for i in range(n_vertices):
for j in range(max_candidates):
p = int(new_candidate_neighbors[0, i, j])
if p < 0:
continue
for k in range(j, max_candidates):
q = int(new_candidate_neighbors[0, i, k])
if q < 0:
continue
d = dist(data[leaf_array[n, i]], data[leaf_array[n, j]], *dist_args)
unchecked_heap_push(
current_graph, leaf_array[n, i], d, leaf_array[n, j], 1
)
unchecked_heap_push(
current_graph, leaf_array[n, j], d, leaf_array[n, i], 1
)
tried.add((leaf_array[n, i], leaf_array[n, j]))
tried.add((leaf_array[n, j], leaf_array[n, i]))

d = dist(data[p], data[q], *dist_args)
c += heap_push(current_graph, p, d, q, 1)
if p != q:
c += heap_push(current_graph, q, d, p, 1)

for k in range(max_candidates):
q = int(old_candidate_neighbors[0, i, k])
if q < 0:
continue

d = dist(data[p], data[q], *dist_args)
c += heap_push(current_graph, p, d, q, 1)
if p != q:
c += heap_push(current_graph, q, d, p, 1)

if c <= delta * n_neighbors * data.shape[0]:
return


@numba.njit(fastmath=True)
def nn_descent_internal_high_memory(
current_graph,
data,
n_neighbors,
rng_state,
tried,
max_candidates=50,
dist=dist.euclidean,
dist_args=(),
n_iters=10,
delta=0.001,
rho=0.5,
verbose=False,
):
n_vertices = data.shape[0]

for n in range(n_iters):
if verbose:
print("\t", n, " / ", n_iters)

(new_candidate_neighbors, old_candidate_neighbors) = new_build_candidates(
current_graph, n_vertices, n_neighbors, max_candidates, rng_state, rho
@@ -87,9 +149,10 @@ def nn_descent(

d = dist(data[p], data[q], *dist_args)
c += unchecked_heap_push(current_graph, p, d, q, 1)
c += unchecked_heap_push(current_graph, q, d, p, 1)
tried.add((p, q))
tried.add((q, p))
if p != q:
c += unchecked_heap_push(current_graph, q, d, p, 1)
tried.add((q, p))

for k in range(max_candidates):
q = int(old_candidate_neighbors[0, i, k])
Expand All @@ -98,17 +161,80 @@ def nn_descent(

d = dist(data[p], data[q], *dist_args)
c += unchecked_heap_push(current_graph, p, d, q, 1)
c += unchecked_heap_push(current_graph, q, d, p, 1)
tried.add((p, q))
tried.add((q, p))
if p != q:
c += unchecked_heap_push(current_graph, q, d, p, 1)
tried.add((q, p))

if c <= delta * n_neighbors * data.shape[0]:
break
return


@numba.njit(fastmath=True)
def nn_descent(
data,
n_neighbors,
rng_state,
max_candidates=50,
dist=dist.euclidean,
dist_args=(),
n_iters=10,
delta=0.001,
rho=0.5,
rp_tree_init=True,
leaf_array=None,
low_memory=False,
verbose=False,
):
tried = set([(-1, -1)])

current_graph = make_heap(data.shape[0], n_neighbors)
for i in range(data.shape[0]):
indices = rejection_sample(n_neighbors, data.shape[0], rng_state)
for j in range(indices.shape[0]):
d = dist(data[i], data[indices[j]], *dist_args)
heap_push(current_graph, i, d, indices[j], 1)
heap_push(current_graph, indices[j], d, i, 1)
tried.add((i, indices[j]))
tried.add((indices[j], i))

if rp_tree_init:
init_rp_tree(data, dist, dist_args, current_graph, leaf_array, tried=tried)

if low_memory:
nn_descent_internal_low_memory(
current_graph,
data,
n_neighbors,
rng_state,
max_candidates=max_candidates,
dist=dist,
dist_args=dist_args,
n_iters=n_iters,
delta=delta,
rho=rho,
verbose=verbose,
)
else:
nn_descent_internal_high_memory(
current_graph,
data,
n_neighbors,
rng_state,
tried,
max_candidates=max_candidates,
dist=dist,
dist_args=dist_args,
n_iters=n_iters,
delta=delta,
rho=rho,
verbose=verbose,
)

return deheap_sort(current_graph)


@numba.njit(parallel=True)
@numba.njit()
def init_from_random(n_neighbors, data, query_points, heap, rng_state, dist, dist_args):
for i in range(query_points.shape[0]):
indices = rejection_sample(n_neighbors, data.shape[0], rng_state)
@@ -120,7 +246,7 @@ def init_from_random(n_neighbors, data, query_points, heap, rng_state, dist, dist_args):
return


@numba.njit(parallel=True)
@numba.njit()
def init_from_tree(tree, data, query_points, heap, rng_state, dist, dist_args):
for i in range(query_points.shape[0]):
indices = search_flat_tree(
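
Taken together, the refactored nn_descent seeds the k-NN heap with rejection-sampled random neighbors, optionally adds RP-tree leaf candidates via init_rp_tree, then refines with either nn_descent_internal_low_memory (which keeps no `tried` set of already-evaluated pairs) or nn_descent_internal_high_memory, and returns deheap_sort(current_graph) as before. A hedged sketch of the user-facing side, assuming the two changed files not shown on this page wire the flag through to the estimator (low_memory is a public UMAP() parameter in released umap-learn versions):

```python
# Hedged sketch of the user-facing flag; the estimator-side wiring lives in the
# changed files not shown on this page.
import numpy as np
import umap

X = np.random.random((10000, 50)).astype(np.float32)

# low_memory=True selects the low-memory NN-descent path above: it drops the
# `tried` bookkeeping, accepting some repeated distance evaluations in exchange
# for a smaller peak memory footprint during nearest-neighbor search.
embedding = umap.UMAP(n_neighbors=15, low_memory=True).fit_transform(X)
print(embedding.shape)  # (10000, 2)
```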
