Merge pull request #34 from microsoft/cvec-weights

Use weighted degree for diagonal augmentation
microsoft · Mar 12, 2020 · dba00c2 · dba00c2
2 parents 7d4af06 + 1616c8d
commit dba00c2
Show file tree

Hide file tree

Showing 8 changed files with 109 additions and 43 deletions.
diff --git a/docs/release_notes.md b/docs/release_notes.md
@@ -1,5 +1,6 @@
 # Release Notes
-
+## 0.1.2
+- Renamed `self_loop_augmentation` to `diagonal_augmentation`; the diagonal is now computed from each vertex's weighted degree instead of its unweighted degree.
 ## 0.1.1
 - [Issue 29](https://github.com/microsoft/topologic/issues/29) Fixed bug in `topologic.io.from_dataset` where an empty networkx graph object (Graph, DiGraph, etc) was being treated as if no networkx Graph object were provided at all.
 - Added `is_digraph` parameter to `topologic.io.from_file`. This parameter defaults to False for original behavior. Setting it to True will create a networkx DiGraph object instead.

diff --git a/tests/embedding/test_adjacency_spectral_embedding.py b/tests/embedding/test_adjacency_spectral_embedding.py
@@ -21,9 +21,9 @@ def test_adjacency_embedding(self):
         self.assertIsInstance(matrix, np.ndarray)
         self.assertIsInstance(labels, list)
         self.assertEqual(2, matrix.ndim)
-        expected_matrix = np.array([[0.43445175],
-                                    [1.14794954],
-                                    [0.8689035]])
+        expected_matrix = np.array([[0.385095],
+                                    [1.140718],
+                                    [0.926595]])
         expected_label = ['a', 'b', 'c']
         np.testing.assert_allclose(expected_matrix, matrix, rtol=1e-6)
         self.assertListEqual(expected_label, labels)
@@ -40,11 +40,11 @@ def test_adjacency_embedding_elbowcut_none(self):
         self.assertIsInstance(matrix, np.ndarray)
         self.assertIsInstance(labels, list)
         self.assertEqual(2, matrix.ndim)
-        expected_matrix = np.array([[0.43445175, 0.29790657],
-                                    [1.14794954, -0.56372701],
-                                    [0.8689035, 0.59581314]])
+        expected_matrix = np.array([[0.385095, -0.351718],
+                                    [1.140718, 0.552094],
+                                    [0.926595, -0.5335]])
         expected_label = ['a', 'b', 'c']
-        np.testing.assert_allclose(expected_matrix, matrix, rtol=1e-6)
+        np.testing.assert_allclose(expected_matrix, matrix, rtol=1e-5)
         self.assertListEqual(expected_label, labels)
 
     def test_adjacency_embedding_gpickle(self):

diff --git a/tests/embedding/test_laplacian_spectral_embedding.py b/tests/embedding/test_laplacian_spectral_embedding.py
@@ -1,10 +1,14 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
+import pickle
+import sys
 import unittest
+
 import networkx as nx
 import numpy as np
-import pickle
+import pytest
+
 from topologic.embedding import laplacian_embedding
 
 
@@ -21,15 +25,18 @@ def test_laplacian_embedding(self):
         self.assertIsInstance(matrix, np.ndarray)
         self.assertIsInstance(labels, list)
         self.assertEqual(2, matrix.ndim)
-        expected_matrix = np.array([[0.44095855],
-                                    [0.70710678],
-                                    [0.5527708]])
+        expected_matrix = np.array([[0.408248],
+                                    [0.707107],
+                                    [0.577350]])
         expected_label = ['a', 'b', 'c']
-        np.testing.assert_allclose(expected_matrix, matrix)
+        np.testing.assert_allclose(expected_matrix, matrix, rtol=1e-5)
         self.assertListEqual(expected_label, labels)
 
     def test_laplacian_embedding_elbowcut_none(self):
-        graph = nx.Graph([('a', 'b', {'weight': 1.0}), ('b', 'c', {'weight': 2.0})])
+        if sys.platform.startswith('darwin'):
+            pytest.skip('Test not supported on Mac OS')
+
+        graph = nx.Graph([('a', 'b', {'weight': 2.0}), ('b', 'c', {'weight': 2.0})])
         result = laplacian_embedding(
             graph,
             elbow_cut=None,
@@ -40,11 +47,11 @@ def test_laplacian_embedding_elbowcut_none(self):
         self.assertIsInstance(matrix, np.ndarray)
         self.assertIsInstance(labels, list)
         self.assertEqual(2, matrix.ndim)
-        expected_matrix = np.array([[0.44095855, 0.51959271],
-                                    [0.70710678, -0.06490658],
-                                    [0.5527708, -0.33146281]])
+        expected_matrix = np.array([[5.000000e-01, 4.714045e-01],
+                                    [7.071068e-01, -3.333333e-01],
+                                    [5.000000e-01, -1.425006e-16]])
         expected_label = ['a', 'b', 'c']
-        np.testing.assert_allclose(expected_matrix, matrix)
+        np.testing.assert_allclose(expected_matrix, matrix, rtol=1e-5)
         self.assertListEqual(expected_label, labels)
 
     def test_laplacian_embedding_gpickle(self):

diff --git a/tests/test_graph_augmentation.py b/tests/test_graph_augmentation.py
@@ -3,21 +3,22 @@
 
 import unittest
 import networkx as nx
-from topologic import self_loop_augmentation
+import numpy as np
+from topologic import diagonal_augmentation
 
 
 class TestDiagonalAugmentation(unittest.TestCase):
     def test_diag_aug_for_a_non_graph_raises_exception(self):
         with self.assertRaises(TypeError) as raised:
-            self_loop_augmentation('not a graph')
+            diagonal_augmentation('not a graph')
 
         self.assertTrue('must be a networkx.Graph' in str(raised.exception))
 
     def test_diag_aug_for_2_nodes_self_loops(self):
         graph = nx.Graph([('a', 'b'), ('b', 'c'), ('a', 'a'), ('b', 'b'), ('c', 'c')])
         expected_set = {('a', 'a', 0.5), ('a', 'b', None), ('b', 'b', 1.0), ('b', 'c', None), ('c', 'c', 0.5)}
 
-        augmented = self_loop_augmentation(graph)
+        augmented = diagonal_augmentation(graph)
         ranked_edge_set = set(augmented.edges(data='weight'))
 
         self.assertEqual(augmented, graph)
@@ -27,8 +28,60 @@ def test_diag_aug_for_2_nodes_no_self_loops(self):
         graph = nx.Graph([('a', 'b'), ('b', 'c')])
         expected_set = {('a', 'a', 0.5), ('a', 'b', None), ('b', 'b', 1.0), ('b', 'c', None), ('c', 'c', 0.5)}
 
-        augmented = self_loop_augmentation(graph)
+        augmented = diagonal_augmentation(graph)
         ranked_edge_set = set(augmented.edges(data='weight'))
 
         self.assertEqual(augmented, graph)
         self.assertEqual(expected_set, ranked_edge_set)
+
+    def test_undirected_uses_weighted_degree(self):
+        start_adajacency = np.array(
+            [
+                [0, 1, 1, 0, 0],
+                [1, 0, 0, 2, 1],
+                [1, 0, 0, 1, 1],
+                [0, 2, 1, 0, 0],
+                [0, 1, 1, 0, 0],
+            ]
+        )
+        expected = [
+            [.5, 1, 1, 0, 0],
+            [1, 1, 0, 2, 1],
+            [1, 0, .75, 1, 1],
+            [0, 2, 1, .75, 0],
+            [0, 1, 1, 0, .5]
+        ]
+
+        g = diagonal_augmentation(nx.Graph(start_adajacency))
+        augmented_adjacency = nx.adj_matrix(g).todense()
+
+        np.testing.assert_array_equal(
+            augmented_adjacency,
+            expected
+        )
+
+    def test_directed_averages_in_out_edge_weights(self):
+        start_adjacency = np.array(
+            [
+                [0, 1, -1, 0, 0],
+                [0, 0, 0, 2, 1],
+                [1, 0, 0, 1, 1],
+                [0, 2, 0, 0, 0],
+                [0, 0, 1, 0, 0],
+            ]
+        )
+        expected = [
+            [.125, 1, -1, 0, 0],
+            [0, .75, 0, 2, 1],
+            [1, 0, .375, 1, 1],
+            [0, 2, 0, .625, 0],
+            [0, 0, 1, 0, .375],
+        ]
+
+        g = diagonal_augmentation(nx.DiGraph(start_adjacency))
+        augmented_adjacency = nx.adj_matrix(g).todense()
+
+        np.testing.assert_array_equal(
+            augmented_adjacency,
+            expected
+        )
diff --git a/topologic/__init__.py b/topologic/__init__.py
@@ -11,7 +11,7 @@
     largest_connected_component, \
     connected_components_generator
 from .partitioned_graph import PartitionedGraph
-from .graph_augmentation import rank_edges, self_loop_augmentation
+from .graph_augmentation import rank_edges, diagonal_augmentation
 
 from . import similarity
 from . import io
@@ -27,6 +27,6 @@
     'largest_connected_component',
     'number_connected_components',
     'PartitionedGraph',
-    'self_loop_augmentation',
+    'diagonal_augmentation',
     'UnweightedGraphError'
 ]
diff --git a/topologic/embedding/omnibus_embedding.py b/topologic/embedding/omnibus_embedding.py
@@ -14,7 +14,7 @@
 from ..connected_components import largest_connected_component
 from ..embedding import EmbeddingMethod
 from ..graph_augmentation import rank_edges, \
-    self_loop_augmentation
+    diagonal_augmentation
 
 
 def omnibus_embedding(
@@ -94,7 +94,7 @@ def omnibus_embedding(
 
     starting_graph = largest_connected_component(graphs[0])
     starting_graph = rank_edges(starting_graph)
-    starting_graph = self_loop_augmentation(starting_graph)
+    starting_graph = diagonal_augmentation(starting_graph)
 
     previous_graph = starting_graph
     count = 1
@@ -104,7 +104,7 @@ def omnibus_embedding(
         count = count + 1
         current_graph = largest_connected_component(graph)
         current_graph = rank_edges(current_graph)
-        current_graph = self_loop_augmentation(current_graph)
+        current_graph = diagonal_augmentation(current_graph)
 
         pairwise_graphs = [previous_graph] + [current_graph]
         pairwise_graphs_reduced = _reduce_to_common_nodes(pairwise_graphs)

diff --git a/topologic/embedding/spectral_embedding.py b/topologic/embedding/spectral_embedding.py
@@ -9,15 +9,15 @@
 
 from .elbow_finder import find_elbows
 from ..graph_augmentation import rank_edges, \
-    self_loop_augmentation
+    diagonal_augmentation
 
 
 def _create_augmented_adjacency_matrix(weight_column, working_graph):
     logging.debug("rank edges")
     ranked_graph = rank_edges(working_graph, weight_column)
 
     logging.debug("add self loops and sensible weights")
-    augmented_graph = self_loop_augmentation(ranked_graph, weight_column)
+    augmented_graph = diagonal_augmentation(ranked_graph, weight_column)
 
     sorted_vertices = sorted(augmented_graph.nodes())
     graph_matrix = nx.to_scipy_sparse_matrix(augmented_graph, nodelist=sorted_vertices)

diff --git a/topologic/graph_augmentation.py b/topologic/graph_augmentation.py
@@ -1,31 +1,31 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
+from typing import Union
+
 import networkx as nx
 import numpy as np
 from scipy.stats import rankdata
 
 from . import assertions
 
 
-def self_loop_augmentation(
-        graph: nx.classes.graph.Graph,
+def diagonal_augmentation(
+        graph: Union[nx.Graph, nx.DiGraph],
         weight_column: str = 'weight'
 ) -> nx.Graph:
     """
-    Generates a self loop for each vertex in the graph with a generated weight for each vertex that is the ratio
-    between its degree in the graph and the total number of *other* vertices in the graph, excluding the original
-    self loop.
-
-    This should be used prior to Spectral Embedding techniques to ensure that there is a reasonable value for
-    each vertex as it will appear in an adjacency matrix.
+    Replaces the diagonal of the adjacency matrix of the graph with each
+    vertex's weighted degree / (number of vertices in the graph - 1). For
+    directed graphs, the weighted in and out degrees are averaged.
 
     Modifies the provided graph in place as well as returning it.
 
-    :param networkx.Graph graph: The networkx graph to diagonally augment
-    :param str weight_column: The weight column to augment
-    :return: The networkx Graph object that was modified in place.
-    :rtype: networkx.Graph
+    :param graph: The networkx graph whose diagonal will be replaced
+    :type graph: Union[nx.Graph, nx.DiGraph]
+    :param str weight_column: The weight column of the edge
+    :return: The networkx Graph or DiGraph object that was modified in place.
+    :rtype: Union[nx.Graph, nx.DiGraph]
     """
     assertions.assert_is_graph(graph)
 
@@ -38,11 +38,16 @@ def self_loop_augmentation(
         if graph.has_edge(vertex, vertex):
             graph.remove_edge(vertex, vertex)
 
-        degree = graph.degree(vertex)
+        if isinstance(graph, nx.DiGraph):
+            in_degree = graph.in_degree(vertex, weight=weight_column)
+            out_degree = graph.out_degree(vertex, weight=weight_column)
+            weighted_degree = (in_degree + out_degree) / 2
+        else:
+            weighted_degree = graph.degree(vertex, weight=weight_column)
 
         # add the augmented weight back onto the diagonal
         graph.add_edge(vertex, vertex)
-        graph[vertex][vertex][weight_column] = degree / (vertex_count - 1)
+        graph[vertex][vertex][weight_column] = weighted_degree / (vertex_count - 1)
 
     return graph