Merge pull request #294 from tomwhite/embedding_optimization_0.4dev
Break out inner loop of optimize_layout_euclidean so numba can parallelize it
lmcinnes committed Sep 12, 2019
2 parents 9a3ac7d + 7ed2b97 commit cc0eaa7
Showing 2 changed files with 117 additions and 61 deletions.
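The refactor follows a standard numba pattern: hoist the body of the hot inner loop into a module-level function written with numba.prange, then compile that function at call time with the desired parallel flag. A minimal sketch of the pattern, not code from this patch (the names _single_epoch and run are hypothetical):

import numba
import numpy as np

# Stand-in for the per-epoch work. Writing the loop with numba.prange
# lets numba split iterations across threads when compiled with
# parallel=True; under parallel=False, prange behaves like plain range.
def _single_epoch(values, alpha):
    for i in numba.prange(values.shape[0]):
        values[i] *= 1.0 - alpha

def run(values, n_epochs, parallel=False):
    # Compile once, at call time, so the caller picks serial or parallel.
    epoch_fn = numba.njit(_single_epoch, fastmath=True, parallel=parallel)
    for n in range(n_epochs):
        epoch_fn(values, n / n_epochs)

run(np.ones(10), 5, parallel=True)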
160 changes: 103 additions & 57 deletions umap/layouts.py
@@ -46,7 +46,82 @@ def rdist(x, y):
    return result


def _optimize_layout_euclidean_single_epoch(
    head_embedding,
    tail_embedding,
    head,
    tail,
    n_vertices,
    epochs_per_sample,
    a,
    b,
    rng_state,
    gamma,
    dim,
    move_other,
    alpha,
    epochs_per_negative_sample,
    epoch_of_next_negative_sample,
    epoch_of_next_sample,
    n,
):
    # Each sampled edge is handled independently, so numba can distribute
    # the iterations across threads when compiled with parallel=True.
    for i in numba.prange(epochs_per_sample.shape[0]):
        if epoch_of_next_sample[i] <= n:
            j = head[i]
            k = tail[i]

            current = head_embedding[j]
            other = tail_embedding[k]

            dist_squared = rdist(current, other)

            # Attractive force between the endpoints of the sampled edge.
            if dist_squared > 0.0:
                grad_coeff = -2.0 * a * b * pow(dist_squared, b - 1.0)
                grad_coeff /= a * pow(dist_squared, b) + 1.0
            else:
                grad_coeff = 0.0

            for d in range(dim):
                grad_d = clip(grad_coeff * (current[d] - other[d]))
                current[d] += grad_d * alpha
                if move_other:
                    other[d] += -grad_d * alpha

            epoch_of_next_sample[i] += epochs_per_sample[i]

            n_neg_samples = int(
                (n - epoch_of_next_negative_sample[i]) / epochs_per_negative_sample[i]
            )

            # Repulsive force against randomly chosen negative samples.
            for p in range(n_neg_samples):
                k = tau_rand_int(rng_state) % n_vertices

                other = tail_embedding[k]

                dist_squared = rdist(current, other)

                if dist_squared > 0.0:
                    grad_coeff = 2.0 * gamma * b
                    grad_coeff /= (0.001 + dist_squared) * (
                        a * pow(dist_squared, b) + 1
                    )
                elif j == k:
                    continue
                else:
                    grad_coeff = 0.0

                for d in range(dim):
                    if grad_coeff > 0.0:
                        grad_d = clip(grad_coeff * (current[d] - other[d]))
                    else:
                        grad_d = 4.0
                    current[d] += grad_d * alpha

            epoch_of_next_negative_sample[i] += (
                n_neg_samples * epochs_per_negative_sample[i]
            )


def optimize_layout_euclidean(
    head_embedding,
    tail_embedding,
@@ -61,6 +136,7 @@ def optimize_layout_euclidean(
    gamma=1.0,
    initial_alpha=1.0,
    negative_sample_rate=5.0,
    parallel=False,
    verbose=False,
):
"""Improve an embedding using stochastic gradient descent to minimize the
Expand Down Expand Up @@ -100,6 +176,10 @@ def optimize_layout_euclidean(
        Initial learning rate for the SGD.
    negative_sample_rate: int (optional, default 5)
        Number of negative samples to use per positive sample.
    parallel: bool (optional, default False)
        Whether to run the computation using numba parallel.
        Running in parallel is non-deterministic, and is not used
        if a random seed has been set, to ensure reproducibility.
    verbose: bool (optional, default False)
        Whether to report information on the current progress of the algorithm.
    Returns
@@ -116,63 +196,29 @@ def optimize_layout_euclidean(
    epoch_of_next_negative_sample = epochs_per_negative_sample.copy()
    epoch_of_next_sample = epochs_per_sample.copy()

    # Compile the per-epoch kernel once; the parallel mode is decided at
    # call time from the caller's flag.
    optimize_fn = numba.njit(
        _optimize_layout_euclidean_single_epoch, fastmath=True, parallel=parallel
    )
    for n in range(n_epochs):
        for i in range(epochs_per_sample.shape[0]):
            if epoch_of_next_sample[i] <= n:
                j = head[i]
                k = tail[i]

                current = head_embedding[j]
                other = tail_embedding[k]

                dist_squared = rdist(current, other)

                if dist_squared > 0.0:
                    grad_coeff = -2.0 * a * b * pow(dist_squared, b - 1.0)
                    grad_coeff /= a * pow(dist_squared, b) + 1.0
                else:
                    grad_coeff = 0.0

                for d in range(dim):
                    grad_d = clip(grad_coeff * (current[d] - other[d]))
                    current[d] += grad_d * alpha
                    if move_other:
                        other[d] += -grad_d * alpha

                epoch_of_next_sample[i] += epochs_per_sample[i]

                n_neg_samples = int(
                    (n - epoch_of_next_negative_sample[i])
                    / epochs_per_negative_sample[i]
                )

                for p in range(n_neg_samples):
                    k = tau_rand_int(rng_state) % n_vertices

                    other = tail_embedding[k]

                    dist_squared = rdist(current, other)

                    if dist_squared > 0.0:
                        grad_coeff = 2.0 * gamma * b
                        grad_coeff /= (0.001 + dist_squared) * (
                            a * pow(dist_squared, b) + 1
                        )
                    elif j == k:
                        continue
                    else:
                        grad_coeff = 0.0

                    for d in range(dim):
                        if grad_coeff > 0.0:
                            grad_d = clip(grad_coeff * (current[d] - other[d]))
                        else:
                            grad_d = 4.0
                        current[d] += grad_d * alpha

                epoch_of_next_negative_sample[i] += (
                    n_neg_samples * epochs_per_negative_sample[i]
                )
        optimize_fn(
            head_embedding,
            tail_embedding,
            head,
            tail,
            n_vertices,
            epochs_per_sample,
            a,
            b,
            rng_state,
            gamma,
            dim,
            move_other,
            alpha,
            epochs_per_negative_sample,
            epoch_of_next_negative_sample,
            epoch_of_next_sample,
            n,
        )

        alpha = initial_alpha * (1.0 - (float(n) / float(n_epochs)))

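The docstring's caveat about non-determinism comes from unsynchronized threads: with parallel=True, prange iterations can touch the same embedding row (edges share vertices) and share rng_state, so the interleaving of writes varies between runs. An illustrative toy example of such a race, not code from this patch:

import numba
import numpy as np

@numba.njit(parallel=True)
def nudge(embedding, head, tail):
    # Edges can share endpoints, so two threads may read and write the
    # same embedding row concurrently; the interleaving, and hence the
    # result, can differ from run to run.
    for i in numba.prange(head.shape[0]):
        embedding[head[i]] += 0.5 * (embedding[tail[i]] - embedding[head[i]])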
18 changes: 14 additions & 4 deletions umap/umap_.py
@@ -876,10 +876,11 @@ def simplicial_set_embedding(
    random_state,
    metric,
    metric_kwds,
    output_metric,
    output_metric_kwds,
    euclidean_output,
    verbose,
    output_metric=dist.named_distances_with_gradients["euclidean"],
    output_metric_kwds={},
    euclidean_output=True,
    parallel=False,
    verbose=False,
):
    """Perform a fuzzy simplicial set embedding, using a specified
    initialisation method and then minimizing the fuzzy set cross entropy
@@ -950,6 +951,11 @@ def simplicial_set_embedding(
    euclidean_output: bool
        Whether to use the faster code specialised for euclidean output metrics
    parallel: bool (optional, default False)
        Whether to run the computation using numba parallel.
        Running in parallel is non-deterministic, and is not used
        if a random seed has been set, to ensure reproducibility.
    verbose: bool (optional, default False)
        Whether to report information on the current progress of the algorithm.
@@ -1034,6 +1040,7 @@ def simplicial_set_embedding(
            gamma,
            initial_alpha,
            negative_sample_rate,
            parallel=parallel,
            verbose=verbose,
        )
    else:
@@ -1703,6 +1710,7 @@ def fit(self, X, y=None):
            self._output_distance_func,
            self._output_metric_kwds,
            self.output_metric in ("euclidean", "l2"),
            self.random_state is None,
            self.verbose,
        )

@@ -1905,6 +1913,7 @@ def transform(self, X):
                self.repulsion_strength,
                self._initial_alpha / 4.0,
                self.negative_sample_rate,
                self.random_state is None,
                verbose=self.verbose,
            )
        else:
@@ -2428,6 +2437,7 @@ def is_discrete_metric(metric_data):
            self._output_distance_func,
            self._output_metric_kwds,
            self.output_metric in ("euclidean", "l2"),
            self.random_state is None,
            self.verbose,
        )

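At the level of the public API, the call sites above wire the new parallel argument to self.random_state is None, so the trade-off is controlled entirely by the constructor. A usage sketch:

import numpy as np
import umap

X = np.random.rand(500, 10)

# Fixing random_state keeps the embedding reproducible; the optimizer
# then takes the serial (parallel=False) path.
embedding_reproducible = umap.UMAP(random_state=42).fit_transform(X)

# Leaving random_state unset lets the epoch function compile with
# parallel=True: typically faster, but results vary from run to run.
embedding_parallel = umap.UMAP().fit_transform(X)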
