
Commit

Merge pull request #524 from jc-healy/dev0.5_disconnect_distance
Dev0.5 disconnect distance (Fixes Issue #523)
lmcinnes committed Jan 9, 2021
2 parents 29e11fc + 3d52451 commit 6264820
Showing 9 changed files with 338 additions and 48 deletions.
3 changes: 3 additions & 0 deletions .idea/.gitignore


5 changes: 0 additions & 5 deletions .idea/codeStyles/codeStyleConfig.xml

This file was deleted.

30 changes: 0 additions & 30 deletions .idea/inspectionProfiles/Project_Default.xml

This file was deleted.

39 changes: 39 additions & 0 deletions doc/faq.rst
@@ -241,6 +241,45 @@ Where can I learn more?
- While PCA is ubiquitous, you may `look <https://github.com/snakers4/playing_with_vae>`_
at this example comparing PCA / UMAP / VAEs;

How UMAP can go wrong
---------------------

One way UMAP can go wrong is the introduction of data points that are maximally far
from all other points in your data set. In other words, a point's nearest neighbour is maximally
far from it. A common example would be a point that shares no features with any other point under
the Jaccard distance, or a point whose nearest neighbour is ``np.inf`` away under a continuous
distance function. In both of these cases UMAP's assumption that all points lie on a connected
manifold can lead us astray. From such a point's perspective all other points are equally valid
nearest neighbours, so its k-nearest-neighbour query will return a random selection of neighbours,
all at this maximal distance. Next we normalize these distances by applying the UMAP kernel, which
says that a point should be maximally similar to its nearest neighbour. Since all k nearest
neighbours are identically far apart, they will all be considered maximally similar by the point
in question. When we try to embed the data into a low-dimensional space, the optimization will
attempt to pull all of these randomly selected points together. Add a sufficiently large number
of such points and the entire space gets pulled together, destroying any of the structure we had
hoped to identify.
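To make this concrete, here is a minimal sketch (using a toy binary matrix, not real data) of a
point that shares no features with any other row: under the Jaccard distance it sits at the
metric's maximum value of 1.0 from every other point, so all of them tie as its "nearest"
neighbour.

```python
import numpy as np
from scipy.spatial.distance import jaccard

# Toy binary feature matrix: the last row shares no features with the others.
data = np.array([
    [1, 1, 0, 0],
    [1, 0, 1, 0],
    [0, 1, 1, 0],
    [0, 0, 0, 1],  # isolated point: no overlap with any row above
], dtype=bool)

# Jaccard distance from the isolated row to every other row.
dists = [jaccard(data[3], data[i]) for i in range(3)]
print(dists)  # all distances are 1.0 -- every neighbour ties at the maximum
```

Any k-nearest-neighbour query on such a point can only break these ties arbitrarily.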

To circumvent this problem we've added a ``disconnection_distance`` parameter to UMAP, which cuts
any edge with a distance greater than or equal to the value passed in. This parameter defaults to
``None``. When set to ``None``, the disconnection distance is set to the maximal value of the
bounded metric in use (for our supported bounded metrics), and to ``np.inf`` otherwise. Removing
these edges from the UMAP graph disconnects the manifold, so the affected points remain where they
are initialized and get pushed away from all other points by the optimization.
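The cutting rule itself is simple to sketch. Assuming an inclusive threshold (the precomputed test
in this commit passes ``disconnection_distance=1`` and expects disconnection), nearest-neighbour
distances at or beyond the cutoff are treated as infinite, i.e. the edge is removed:

```python
import numpy as np

# Sketch of the edge-cutting rule with an inclusive threshold.
disconnection_distance = 1.0  # the maximum of a bounded metric such as Jaccard
knn_dists = np.array([0.2, 0.7, 1.0, 0.4])

# Distances at or beyond the cutoff become infinite: the edge is cut.
knn_dists[knn_dists >= disconnection_distance] = np.inf
print(knn_dists)  # -> [0.2 0.7 inf 0.4]
```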

If users have a good understanding of their distance metric, they can set this value by hand to
prevent data in particularly sparse regions of their space from becoming connected to the manifold.
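One hypothetical way to pick such a hand-tuned value is to inspect the distribution of each
point's nearest-neighbour distance and cut only the extreme tail. Everything below (the random
data, the Euclidean metric, and the 99th-percentile choice) is illustrative, not part of the UMAP
API:

```python
import numpy as np
from scipy.spatial.distance import pdist, squareform

rng = np.random.default_rng(42)
data = rng.random((50, 8))

# Pairwise distance matrix; ignore self-distances on the diagonal.
dmat = squareform(pdist(data, metric="euclidean"))
np.fill_diagonal(dmat, np.inf)

# Distance from each point to its nearest neighbour.
nn_dist = dmat.min(axis=1)

# A hand-tuned cutoff: disconnect the sparsest 1% of points.
cutoff = float(np.percentile(nn_dist, 99))
# cutoff could then be passed as UMAP's disconnection_distance
```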

If vertices in your graph are disconnected, a warning will be raised. At that point a user can
make use of the ``umap.utils.disconnected_vertices()`` function to identify the disconnected
points. These can either be filtered out before training a new UMAP model, or simply be used as a
filter for visualization purposes, as seen below.

.. code:: python3

    umap_model = umap.UMAP().fit(data)
    disconnected_points = umap.utils.disconnected_vertices(umap_model)
    umap.plot.points(umap_model, subset_points=~disconnected_points)

Successful use-cases
--------------------

2 changes: 2 additions & 0 deletions doc/release_notes.rst
@@ -13,6 +13,8 @@ What's new in 0.5
* UMAP now supports an ``update`` method to add new data and retrain.
* Various performance improvements and bug fixes
* Additional plotting support, including text searching in interactive plots
* ``disconnection_distance`` allows disconnecting points beyond a certain distance from our manifold. (Thanks to John Healy)


What's new in 0.4
-----------------
6 changes: 4 additions & 2 deletions umap/spectral.py
@@ -96,11 +96,13 @@ def component_layout(

if metric in SPECIAL_METRICS:
distance_matrix = pairwise_special_metric(
component_centroids, metric=metric
component_centroids, metric=metric, kwds=metric_kwds,
)
elif metric in SPARSE_SPECIAL_METRICS:
distance_matrix = pairwise_special_metric(
component_centroids, metric=SPARSE_SPECIAL_METRICS[metric],
component_centroids,
metric=SPARSE_SPECIAL_METRICS[metric],
kwds=metric_kwds,
)
else:
if callable(metric) and scipy.sparse.isspmatrix(data):
71 changes: 71 additions & 0 deletions umap/tests/test_umap_ops.py
@@ -13,6 +13,11 @@
from umap.spectral import component_layout
import numpy as np
import scipy.sparse
import pytest
import warnings
from umap.distances import pairwise_special_metric
from umap.utils import disconnected_vertices
from scipy.sparse import csr_matrix

# Transform isn't stable under batching; hard to opt out of this.
# @SkipTest
@@ -94,6 +99,72 @@ def test_multi_component_layout_precomputed():

assert_less(error, 15.0, msg="Multi component embedding too far astray")

@pytest.mark.parametrize("num_isolates", [1, 5])
@pytest.mark.parametrize("metric", ["jaccard", "hellinger", "cosine"])
@pytest.mark.parametrize("force_approximation", [True, False])
def test_disconnected_data(num_isolates, metric, force_approximation):
disconnected_data = np.random.choice(
a=[False, True], size=(10, 20), p=[0.66, 1 - 0.66]
)
# Add some disconnected data for the corner case test
disconnected_data = np.vstack(
[disconnected_data, np.zeros((num_isolates, 20), dtype="bool")]
)
new_columns = np.zeros((num_isolates + 10, num_isolates), dtype="bool")
for i in range(num_isolates):
new_columns[10 + i, i] = True
disconnected_data = np.hstack([disconnected_data, new_columns])

with warnings.catch_warnings(record=True) as w:
model = UMAP(
n_neighbors=3,
metric=metric,
force_approximation_algorithm=force_approximation,
).fit(disconnected_data)
assert len(w) >= 1 # at least one warning should be raised here
# we can't guarantee the order that the warnings will be raised in so check them all.
flag = 0
if num_isolates == 1:
warning_contains = "A few of your vertices"
elif num_isolates > 1:
warning_contains = "A large number of your vertices"
for i in range(len(w)):
flag += warning_contains in str(w[i].message)
assert flag == 1
# Check that the first isolate has no edges in our umap.graph_
isolated_vertices = disconnected_vertices(model)
assert isolated_vertices[10]
number_of_nan = np.sum(np.isnan(model.embedding_[isolated_vertices]))
assert number_of_nan >= num_isolates * model.n_components

@pytest.mark.parametrize("num_isolates", [1])
@pytest.mark.parametrize("sparse", [True, False])
def test_disconnected_data_precomputed(num_isolates, sparse):
disconnected_data = np.random.choice(
a=[False, True], size=(10, 20), p=[0.66, 1 - 0.66]
)
# Add some disconnected data for the corner case test
disconnected_data = np.vstack(
[disconnected_data, np.zeros((num_isolates, 20), dtype="bool")]
)
new_columns = np.zeros((num_isolates + 10, num_isolates), dtype="bool")
for i in range(num_isolates):
new_columns[10 + i, i] = True
disconnected_data = np.hstack([disconnected_data, new_columns])
dmat = pairwise_special_metric(disconnected_data)
if sparse:
dmat = csr_matrix(dmat)
model = UMAP(n_neighbors=3, metric="precomputed", disconnection_distance=1).fit(
dmat
)

# Check that the first isolate has no edges in our umap.graph_
isolated_vertices = disconnected_vertices(model)
assert isolated_vertices[10]
number_of_nan = np.sum(np.isnan(model.embedding_[isolated_vertices]))
assert number_of_nan >= num_isolates * model.n_components


# ---------------
# Umap Transform
# --------------
