Merge pull request #76 from lmcinnes/memory_option
Allow a low memory option for index construction
lmcinnes committed Sep 9, 2019
2 parents d35c848 + 679527c commit 1b28d19
Showing 3 changed files with 361 additions and 47 deletions.
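For a quick sense of how the new option is used, here is a minimal sketch. It assumes only the public constructors touched in this diff (NNDescent and PyNNDescentTransformer) plus the usual scikit-learn fit_transform convention for the transformer; the dataset and parameter values are purely illustrative.

import numpy as np
from pynndescent import NNDescent, PyNNDescentTransformer

# Illustrative data: 10,000 points in 32 dimensions.
data = np.random.random((10000, 32)).astype(np.float32)

# Default behavior is unchanged: low_memory=False takes the faster,
# higher-memory construction path.
index = NNDescent(data, n_neighbors=15)

# New in this PR: opt into the lower-memory (but more computationally
# expensive) construction path.
index_low_mem = NNDescent(data, n_neighbors=15, low_memory=True)

# The scikit-learn style transformer exposes the same keyword.
knn_graph = PyNNDescentTransformer(n_neighbors=15, low_memory=True).fit_transform(data)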
171 changes: 153 additions & 18 deletions pynndescent/pynndescent_.py
@@ -157,7 +157,8 @@ def init_rp_tree(data, dist, dist_args, current_graph, leaf_array, tried=None):


@numba.njit(fastmath=True)
def nn_descent(
def nn_descent_internal_low_memory(
current_graph,
data,
n_neighbors,
rng_state,
@@ -167,28 +167,72 @@ def nn_descent(
n_iters=10,
delta=0.001,
rho=0.5,
rp_tree_init=True,
leaf_array=None,
verbose=False,
seed_per_row=False,
):
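    # Low-memory variant of the NN-descent inner loop: it takes no `tried` set,
    # so pairs that were already evaluated may be scored again, trading extra
    # distance computations for a smaller memory footprint.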
n_vertices = data.shape[0]
tried = set([(-1, -1)])

current_graph = make_heap(data.shape[0], n_neighbors)
for i in range(data.shape[0]):
if seed_per_row:
seed(rng_state, i)
indices = rejection_sample(n_neighbors, data.shape[0], rng_state)
for j in range(indices.shape[0]):
d = dist(data[i], data[indices[j]], *dist_args)
heap_push(current_graph, i, d, indices[j], 1)
heap_push(current_graph, indices[j], d, i, 1)
tried.add((i, indices[j]))
tried.add((indices[j], i))
for n in range(n_iters):
if verbose:
print("\t", n, " / ", n_iters)

if rp_tree_init:
init_rp_tree(data, dist, dist_args, current_graph, leaf_array, tried=tried)
(new_candidate_neighbors, old_candidate_neighbors) = new_build_candidates(
current_graph,
n_vertices,
n_neighbors,
max_candidates,
rng_state,
rho,
seed_per_row,
)
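        # Local join over the candidate lists: score each pair of new candidates,
        # and each new candidate against the old candidates, pushing any closer
        # neighbors into the heap; c counts the resulting heap updates so the
        # loop can stop early once improvements become rare.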

c = 0
for i in range(n_vertices):
for j in range(max_candidates):
p = int(new_candidate_neighbors[0, i, j])
if p < 0:
continue
for k in range(j, max_candidates):
q = int(new_candidate_neighbors[0, i, k])
if q < 0:
continue

d = dist(data[p], data[q], *dist_args)
c += heap_push(current_graph, p, d, q, 1)
if p != q:
c += heap_push(current_graph, q, d, p, 1)

for k in range(max_candidates):
q = int(old_candidate_neighbors[0, i, k])
if q < 0:
continue

d = dist(data[p], data[q], *dist_args)
c += heap_push(current_graph, p, d, q, 1)
if p != q:
c += heap_push(current_graph, q, d, p, 1)

if c <= delta * n_neighbors * data.shape[0]:
return


@numba.njit(fastmath=True)
def nn_descent_internal_high_memory(
current_graph,
data,
n_neighbors,
rng_state,
tried,
max_candidates=50,
dist=dist.euclidean,
dist_args=(),
n_iters=10,
delta=0.001,
rho=0.5,
verbose=False,
seed_per_row=False,
):
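    # High-memory variant: the caller-provided `tried` set records every pair
    # already scored, so repeated distance computations are skipped at the cost
    # of keeping that set in memory.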
n_vertices = data.shape[0]

for n in range(n_iters):
if verbose:
@@ -235,7 +280,74 @@ def nn_descent(
tried.add((q, p))

if c <= delta * n_neighbors * data.shape[0]:
break
return


@numba.njit(fastmath=True)
def nn_descent(
data,
n_neighbors,
rng_state,
max_candidates=50,
dist=dist.euclidean,
dist_args=(),
n_iters=10,
delta=0.001,
rho=0.5,
rp_tree_init=True,
leaf_array=None,
low_memory=False,
verbose=False,
seed_per_row=False,
):
tried = set([(-1, -1)])

current_graph = make_heap(data.shape[0], n_neighbors)
for i in range(data.shape[0]):
if seed_per_row:
seed(rng_state, i)
indices = rejection_sample(n_neighbors, data.shape[0], rng_state)
for j in range(indices.shape[0]):
d = dist(data[i], data[indices[j]], *dist_args)
heap_push(current_graph, i, d, indices[j], 1)
heap_push(current_graph, indices[j], d, i, 1)
tried.add((i, indices[j]))
tried.add((indices[j], i))

if rp_tree_init:
init_rp_tree(data, dist, dist_args, current_graph, leaf_array, tried=tried)
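    # The low_memory flag below selects which inner loop runs; only the
    # high-memory path reuses the `tried` set built during initialization.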

if low_memory:
nn_descent_internal_low_memory(
current_graph,
data,
n_neighbors,
rng_state,
max_candidates=max_candidates,
dist=dist,
dist_args=dist_args,
n_iters=n_iters,
delta=delta,
rho=rho,
verbose=verbose,
seed_per_row=seed_per_row,
)
else:
nn_descent_internal_high_memory(
current_graph,
data,
n_neighbors,
rng_state,
tried,
max_candidates=max_candidates,
dist=dist,
dist_args=dist_args,
n_iters=n_iters,
delta=delta,
rho=rho,
verbose=verbose,
seed_per_row=seed_per_row,
)

return deheap_sort(current_graph)

@@ -429,6 +541,13 @@ class NNDescent(object):
alternative algorithm can be fast for large ``n_neighbors`` values.
To use the alternative algorithm specify ``'alternative'``.
low_memory: boolean (optional, default=False)
        Whether to use a lower-memory, but more computationally expensive,
        approach to index construction. This defaults to False because the
        standard approach is faster for most datasets; if index construction
        is using excessive memory for your dataset, consider setting this
        to True.
max_candidates: int (optional, default=20)
Internally each "self-join" keeps a maximum number of candidates (
nearest neighbors and reverse nearest neighbors) to be considered.
@@ -477,6 +596,7 @@ def __init__(
tree_init=True,
random_state=np.random,
algorithm="standard",
low_memory=False,
max_candidates=20,
n_iters=None,
delta=0.001,
@@ -498,6 +618,7 @@ def __init__(
self.leaf_size = leaf_size
self.prune_level = pruning_level
self.max_candidates = max_candidates
self.low_memory = low_memory
self.n_iters = n_iters
self.delta = delta
self.rho = rho
@@ -637,6 +758,8 @@ def __init__(
self.n_neighbors,
self.rng_state,
self.max_candidates,
rho=self.rho,
low_memory=self.low_memory,
sparse_dist=self._distance_func,
dist_args=self._dist_args,
n_iters=self.n_iters,
@@ -662,6 +785,7 @@ def __init__(
self.n_iters,
self.delta,
self.rho,
low_memory=self.low_memory,
rp_tree_init=True,
leaf_array=leaf_array,
verbose=verbose,
@@ -898,6 +1022,14 @@ class PyNNDescentTransformer(BaseEstimator, TransformerMixin):
alternative algorithm can be fast for large ``n_neighbors`` values.
To use the alternative algorithm specify ``'alternative'``.
low_memory: boolean (optional, default=False)
        Whether to use a lower-memory, but more computationally expensive,
        approach to index construction. This defaults to False because the
        standard approach is faster for most datasets; if index construction
        is using excessive memory for your dataset, consider setting this
        to True.
max_candidates: int (optional, default=20)
Internally each "self-join" keeps a maximum number of candidates (
nearest neighbors and reverse nearest neighbors) to be considered.
@@ -950,6 +1082,7 @@ def __init__(
tree_init=True,
random_state=np.random,
algorithm="standard",
low_memory=False,
max_candidates=20,
n_iters=None,
early_termination_value=0.001,
@@ -967,6 +1100,7 @@ def __init__(
self.tree_init = tree_init
self.random_state = random_state
self.algorithm = algorithm
self.low_memory = low_memory
self.max_candidates = max_candidates
self.n_iters = n_iters
self.early_termination_value = early_termination_value
@@ -1005,6 +1139,7 @@ def fit(self, X):
self.tree_init,
self.random_state,
self.algorithm,
self.low_memory,
self.max_candidates,
self.n_iters,
self.early_termination_value,