Skip to content

Commit

Permalink
Merge pull request #225 from dkobak/init-fixes
Browse files Browse the repository at this point in the history
Improve PCA and spectral inits
  • Loading branch information
pavlin-policar committed Feb 2, 2023
2 parents db111ee + 7661b59 commit 4cae0be
Show file tree
Hide file tree
Showing 4 changed files with 106 additions and 15 deletions.
75 changes: 69 additions & 6 deletions openTSNE/initialization.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,14 @@
from openTSNE import utils


def rescale(x, inplace=False):
def rescale(x, inplace=False, target_std=1e-4):
"""Rescale an embedding so optimization will not have convergence issues.
Parameters
----------
x: np.ndarray
inplace: bool
target_std: float
Returns
-------
Expand All @@ -23,7 +24,34 @@ def rescale(x, inplace=False):
if not inplace:
x = np.array(x, copy=True)

x /= np.std(x[:, 0]) * 10000
x /= np.std(x[:, 0]) / target_std

return x


def jitter(x, inplace=False, scale=0.01, random_state=None):
"""Add jitter with small standard deviation to avoid numerical problems
when the points overlap exactly.
Parameters
----------
x: np.ndarray
inplace: bool
scale: float
random_state: int or np.random.RandomState
Returns
-------
np.ndarray
A jittered version of ``x``.
"""
if not inplace:
x = np.array(x, copy=True)

target_std = np.std(x[:, 0]) * scale
random_state = check_random_state(random_state)
x += random_state.normal(0, target_std, x.shape)

return x

Expand Down Expand Up @@ -59,7 +87,14 @@ def random(n_samples, n_components=2, random_state=None, verbose=False):
return np.ascontiguousarray(embedding)


def pca(X, n_components=2, svd_solver="auto", random_state=None, verbose=False):
def pca(
X,
n_components=2,
svd_solver="auto",
random_state=None,
verbose=False,
add_jitter=True,
):
"""Initialize an embedding using the top principal components.
Parameters
Expand All @@ -80,6 +115,11 @@ def pca(X, n_components=2, svd_solver="auto", random_state=None, verbose=False):
number generator is the RandomState instance used by `np.random`.
verbose: bool
add_jitter: bool
If True, jitter with small standard deviation is added to the
initialization to prevent points overlapping exactly, which may lead to
numerical issues during optimization.
Returns
-------
Expand All @@ -93,14 +133,25 @@ def pca(X, n_components=2, svd_solver="auto", random_state=None, verbose=False):
n_components=n_components, svd_solver=svd_solver, random_state=random_state
)
embedding = pca_.fit_transform(X)

rescale(embedding, inplace=True)
if add_jitter:
jitter(embedding, inplace=True, random_state=random_state)

timer.__exit__()

return np.ascontiguousarray(embedding)


def spectral(A, n_components=2, tol=1e-4, max_iter=None, random_state=None, verbose=False):
def spectral(
A,
n_components=2,
tol=1e-4,
max_iter=None,
random_state=None,
verbose=False,
add_jitter=True,
):
"""Initialize an embedding using the spectral embedding of the KNN graph.
Specifically, we initialize data points by computing the diffusion map on
Expand All @@ -122,7 +173,15 @@ def spectral(A, n_components=2, tol=1e-4, max_iter=None, random_state=None, verb
See scipy.sparse.linalg.eigsh documentation.
random_state: Any
Unused, but kept for consistency between initialization schemes.
If the value is an int, random_state is the seed used by the random
number generator. If the value is a RandomState instance, then it will
be used as the random number generator. If the value is None, the random
number generator is the RandomState instance used by `np.random`.
add_jitter: bool
If True, jitter with small standard deviation is added to the
initialization to prevent points overlapping exactly, which may lead to
numerical issues during optimization.
verbose: bool
Expand All @@ -143,7 +202,9 @@ def spectral(A, n_components=2, tol=1e-4, max_iter=None, random_state=None, verb

# Find leading eigenvectors
k = n_components + 1
v0 = np.ones(A.shape[0]) / np.sqrt(A.shape[0])
# v0 initializatoin is taken from sklearn.utils._arpack._init_arpack_v0()
random_state = check_random_state(random_state)
v0 = random_state.uniform(-1, 1, A.shape[0])
eigvals, eigvecs = sp.linalg.eigsh(
A, M=D, k=k, tol=tol, maxiter=max_iter, which="LM", v0=v0
)
Expand All @@ -158,6 +219,8 @@ def spectral(A, n_components=2, tol=1e-4, max_iter=None, random_state=None, verb
embedding = eigvecs[:, 1:]

rescale(embedding, inplace=True)
if add_jitter:
jitter(embedding, inplace=True, random_state=random_state)

timer.__exit__()

Expand Down
20 changes: 20 additions & 0 deletions tests/test_correctness.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.manifold import SpectralEmbedding

TSNE = partial(openTSNE.TSNE, neighbors="exact", negative_gradient_method="bh")

Expand Down Expand Up @@ -298,3 +299,22 @@ def test_iris(self):

np.testing.assert_almost_equal(embedding1, embedding2)


class TestSpectralInitializationCorrectness(unittest.TestCase):
def test_spectral_agreement_with_sklearn(self):
# Generate some random data and stretch it, to give it some structure
np.random.seed(42)
x = np.random.randn(100, 20)
x[:,0] *= 5

# Perform spectral embedding via sklearn and via openTSNE
P = openTSNE.affinity.PerplexityBasedNN(x).P
embedding1 = openTSNE.initialization.spectral(P, tol=0, add_jitter=False)
embedding2 = SpectralEmbedding(affinity='precomputed').fit_transform(P)

np.testing.assert_almost_equal(
np.abs(np.corrcoef(embedding1[:,0], embedding2[:,0])[0,1]), 1
)
np.testing.assert_almost_equal(
np.abs(np.corrcoef(embedding1[:,1], embedding2[:,1])[0,1]), 1
)
4 changes: 2 additions & 2 deletions tests/test_different_usages.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,10 +191,10 @@ def test_affinity_with_precomputed_neighbors(self):

class TestUsageExplicitOptimizeCalls(TestUsage):
def test_explicit_optimize_calls(self):
embedding1 = TSNE().fit(self.x)
embedding1 = TSNE(random_state=42).fit(self.x)

A = affinity.PerplexityBasedNN(self.x)
I = initialization.pca(self.x)
I = initialization.pca(self.x, random_state=42)
embedding2 = TSNEEmbedding(I, A)
embedding2 = embedding2.optimize(n_iter=25, exaggeration=12)
embedding2 = embedding2.optimize(n_iter=50)
Expand Down
22 changes: 15 additions & 7 deletions tests/test_tsne.py
Original file line number Diff line number Diff line change
Expand Up @@ -466,9 +466,9 @@ def test_parameter_init_takes_precendence_over_constructor_init(self):

def test_pca_init_with_only_affinities_passed(self):
aff = affinity.PerplexityBasedNN(self.x, 5, method="exact")
desired_init = initialization.spectral(aff.P)
desired_init = initialization.spectral(aff.P, random_state=42)
embedding = TSNE(
early_exaggeration_iter=0, n_iter=0, initialization="pca"
early_exaggeration_iter=0, n_iter=0, initialization="pca", random_state=42
).fit(affinities=aff)
np.testing.assert_array_equal(embedding, desired_init)

Expand Down Expand Up @@ -935,18 +935,26 @@ def test_precomputed_dist_matrix_via_affinities_uses_spectral_init(self):
d = squareform(pdist(x))

aff = affinity.PerplexityBasedNN(d, metric="precomputed")
desired_init = initialization.spectral(aff.P)
embedding = TSNE(early_exaggeration_iter=0, n_iter=0).fit(affinities=aff)
desired_init = initialization.spectral(aff.P, random_state=42)
embedding = TSNE(
early_exaggeration_iter=0,
n_iter=0,
random_state=42,
).fit(affinities=aff)
np.testing.assert_array_equal(embedding, desired_init)

def test_precomputed_dist_matrix_via_tsne_interface_uses_spectral_init(self):
x = np.random.normal(0, 1, (200, 5))
d = squareform(pdist(x))

aff = affinity.PerplexityBasedNN(d, metric="precomputed")
desired_init = initialization.spectral(aff.P)
embedding = TSNE(metric="precomputed", early_exaggeration_iter=0, n_iter=0) \
.fit(d)
desired_init = initialization.spectral(aff.P, random_state=42)
embedding = TSNE(
metric="precomputed",
early_exaggeration_iter=0,
n_iter=0,
random_state=42,
).fit(d)
np.testing.assert_array_equal(embedding, desired_init)

def test_precomputed_dist_matrix_doesnt_override_valid_inits(self):
Expand Down

0 comments on commit 4cae0be

Please sign in to comment.