diff --git a/graphconstructor/operators/__init__.py b/graphconstructor/operators/__init__.py index 9e3c09e..88475b6 100644 --- a/graphconstructor/operators/__init__.py +++ b/graphconstructor/operators/__init__.py @@ -1,6 +1,6 @@ from .base import GraphOperator from .disparity import DisparityFilter -from .doubly_stochastic import DoublyStochastic +from .doubly_stochastic import DoublyStochasticBackbone, DoublyStochasticNormalize from .knn_selector import KNNSelector from .locally_adaptive_sparsification import LocallyAdaptiveSparsification from .marginal_likelihood import MarginalLikelihoodFilter @@ -11,7 +11,8 @@ __all__ = [ "DisparityFilter", - "DoublyStochastic", + "DoublyStochasticNormalize", + "DoublyStochasticBackbone", "GraphOperator", "KNNSelector", "LocallyAdaptiveSparsification", diff --git a/graphconstructor/operators/doubly_stochastic.py b/graphconstructor/operators/doubly_stochastic.py index 89341eb..406525c 100644 --- a/graphconstructor/operators/doubly_stochastic.py +++ b/graphconstructor/operators/doubly_stochastic.py @@ -1,14 +1,14 @@ +import warnings from dataclasses import dataclass import networkx as nx import numpy as np +import scipy.sparse as sp from ..graph import Graph from .base import GraphOperator -# Z. 48-57: -# https://gitlab.liris.cnrs.fr/coregraphie/netbone/-/blob/main/netbone/structural/doubly_stochastic.py?ref_type=heads @dataclass(slots=True) -class DoublyStochastic(GraphOperator): +class DoublyStochasticNormalize(GraphOperator): """ Alternating row/column normalization (Sinkhorn-Knopp) to make the adjacency approximately doubly stochastic. Works with CSR without densifying. @@ -33,7 +33,6 @@ class DoublyStochastic(GraphOperator): Copy metadata frame if present. Default True. """ - backbone_method: bool = False tolerance: float = 1e-5 max_iter: int = 10_000 copy_meta: bool = True @@ -44,9 +43,9 @@ def apply(self, G: Graph) -> Graph: A = G.adj.tocsr(copy=False) if A.shape[0] != A.shape[1]: - raise TypeError("DoublyStochastic requires a square adjacency.") + raise TypeError("DoublyStochasticNormalize requires a square adjacency.") if (A.data < 0).any(): - raise ValueError("DoublyStochastic requires nonnegative weights.") + raise ValueError("DoublyStochasticNormalize requires nonnegative weights.") n = A.shape[0] if A.nnz == 0: @@ -67,20 +66,28 @@ def apply(self, G: Graph) -> Graph: # Precompute nnz masks for convergence checks under sparsity row_has_edges = np.array(A.sum(axis=1) > 0).T[0] col_has_edges = np.array(A.sum(axis=0) > 0)[0] - # The csc conversions above could be heavy; build once + + # Build transposed CSR once for repeated column-sum products A_T = A.T.tocsr(copy=False) min_thres = 1.0 - self.tolerance max_thres = 1.0 + self.tolerance - # Parameter for stabilitation + # Parameter for stabilization MAX_FACTOR = 1e50 + converged = False for _ in range(self.max_iter): c[col_has_edges] = 1 / A_T.dot(r)[col_has_edges] r[row_has_edges] = 1 / A.dot(c)[row_has_edges] + if np.any(np.abs(r) > MAX_FACTOR) or np.any(np.abs(c) > MAX_FACTOR): - break # avoid overflow; close enough + warnings.warn( + "DoublyStochasticNormalize stopped early because scaling factors " + "became very large. Result may not be doubly stochastic.", + RuntimeWarning, + ) + break # Convergence check (band and tol) row_sums = r * (A.dot(c)) @@ -88,66 +95,152 @@ def apply(self, G: Graph) -> Graph: # Only check rows/cols that have edges (others stay 0 and are irrelevant) if row_has_edges.any(): - rows_ok = np.all((row_sums[row_has_edges] >= min_thres) & (row_sums[row_has_edges] <= max_thres)) + rows_ok = np.all( + (row_sums[row_has_edges] >= min_thres) + & (row_sums[row_has_edges] <= max_thres) + ) else: rows_ok = True if col_has_edges.any(): - cols_ok = np.all((col_sums[col_has_edges] >= min_thres) & (col_sums[col_has_edges] <= max_thres)) + cols_ok = np.all( + (col_sums[col_has_edges] >= min_thres) + & (col_sums[col_has_edges] <= max_thres) + ) else: cols_ok = True if rows_ok and cols_ok: + converged = True break + if not converged: + warnings.warn( + "DoublyStochasticNormalize did not converge within max_iter.", + RuntimeWarning, + ) + # Apply scaling once: A' = diag(r) * A * diag(c) (CSR-friendly) A_scaled = A.copy() + # row scaling A_scaled.data *= np.repeat(r, np.diff(A_scaled.indptr)) + # col scaling A_scaled.data *= c[A_scaled.indices] - # step 2 - if self.backbone_method: - i = 0 - - rows, cols = A_scaled.nonzero() - vals = A_scaled.data - - order = np.argsort(vals)[::-1] - rows = rows[order] - cols = cols[order] - vals = vals[order] - if not G.directed: - G_filtered = nx.Graph() - while ( - nx.number_connected_components(G_filtered) != 1 - or len(G_filtered) < A_scaled.shape[0] - or not nx.is_connected(G_filtered) - ): - if i == len(rows) or G_filtered.number_of_nodes() == len(set(rows)): - break - G_filtered.add_edge(rows[i], cols[i], weight=vals[i]) - i += 1 - - # add isolated nodes - G_filtered.add_nodes_from(range(G.n_nodes)) - G_csr = nx.to_scipy_sparse_array(G_filtered) + # TODO: For undirected graphs, Graph.from_csr symmetrizes A_scaled again. + # This may change row/column sums after normalization. A future revision should + # either use symmetric scaling or allow Graph construction without re-symmetrizing. + return Graph.from_csr( + A_scaled, + directed=G.directed, + weighted=True, + mode=G.mode, + meta=(G.meta.copy() if (self.copy_meta and G.meta is not None) else G.meta), + sym_op="max", + ) - return Graph.from_csr( - G_csr, - directed=G.directed, - weighted=True, - mode=G.mode, - meta=(G.meta.copy() if (self.copy_meta and G.meta is not None) else G.meta), - sym_op="max", + +@dataclass(slots=True) +class DoublyStochasticBackbone(GraphOperator): + """ + Backbone extraction based on doubly-stochastic edge scores. + + First applies DoublyStochasticNormalize, then sorts edges by normalized score + and adds strongest edges until the graph becomes connected, or until no candidate + edges remain. For disconnected inputs, the result is a strongest-edge forest + over the available components. + + References + ---------- + - Coscia, M. (2025). "The Atlas for the Inspiring Network Scientist." + - Yassin, A. (2023). "An evaluation tool for backbone extraction techniques + in weighted complex networks." Scientific Reports. + + Parameters + ---------- + tolerance : float + Band tolerance passed to DoublyStochasticNormalize. Default 1e-5. + max_iter : int + Maximum Sinkhorn iterations passed to DoublyStochasticNormalize. + Default 10_000. + copy_meta : bool + Copy metadata frame if present. Default True. + """ + + tolerance: float = 1e-5 + max_iter: int = 10_000 + copy_meta: bool = True + supported_modes = ["similarity"] + + def apply(self, G: Graph) -> Graph: + self._check_mode_supported(G) + + if G.directed: + raise NotImplementedError( + "DoublyStochasticBackbone currently supports only undirected graphs." ) + normalized = DoublyStochasticNormalize( + tolerance=self.tolerance, + max_iter=self.max_iter, + copy_meta=self.copy_meta, + ).apply(G) + + A_scaled = normalized.adj.tocsr(copy=False) + G_filtered = self._extract_undirected_backbone(A_scaled, G.n_nodes) + + G_csr = nx.to_scipy_sparse_array( + G_filtered, + nodelist=list(range(G.n_nodes)), + weight="weight", + format="csr", + ) + return Graph.from_csr( - A_scaled, + G_csr, directed=G.directed, weighted=True, mode=G.mode, meta=(G.meta.copy() if (self.copy_meta and G.meta is not None) else G.meta), sym_op="max", ) + + @staticmethod + def _extract_undirected_backbone(A_scaled, n_nodes: int) -> nx.Graph: + """ + Extract an undirected backbone from normalized edge scores. + + Uses only the upper triangle of the adjacency matrix so that each + undirected edge is considered once. + """ + G_filtered = nx.Graph() + G_filtered.add_nodes_from(range(n_nodes)) + + if n_nodes <= 1 or A_scaled.nnz == 0: + return G_filtered + + # Use only one orientation per undirected edge and ignore self-loops. + A_upper = sp.triu(A_scaled, k=1).tocoo() + + if A_upper.nnz == 0: + return G_filtered + + rows = A_upper.row + cols = A_upper.col + vals = A_upper.data + + order = np.argsort(vals)[::-1] + + for idx in order: + if nx.is_connected(G_filtered): # TODO: For large graphs, this check could be expensive + break + + G_filtered.add_edge( + int(rows[idx]), + int(cols[idx]), + weight=float(vals[idx]), + ) + + return G_filtered diff --git a/tests/test_doubly_stochastic.py b/tests/test_doubly_stochastic.py index a3ffffd..2a5c792 100644 --- a/tests/test_doubly_stochastic.py +++ b/tests/test_doubly_stochastic.py @@ -3,7 +3,7 @@ import pytest import scipy.sparse as sp from graphconstructor import Graph -from graphconstructor.operators import DoublyStochastic +from graphconstructor.operators import DoublyStochasticBackbone, DoublyStochasticNormalize def _csr(data, rows, cols, n): @@ -29,7 +29,7 @@ def test_doubly_stochastic_converges_on_positive_dense(): np.fill_diagonal(M, 0.0) G0 = Graph.from_dense(M, directed=False, weighted=True, mode="similarity", sym_op="max") - op = DoublyStochastic(tolerance=1e-6, max_iter=10_000) + op = DoublyStochasticNormalize(tolerance=1e-6, max_iter=10_000) G = op.apply(G0) A = G.adj @@ -61,9 +61,15 @@ def test_doubly_stochastic_with_backbone_method(): # Zero diagonal np.fill_diagonal(M, 0.0) - G0 = Graph.from_dense(M, directed=False, weighted=True, mode="similarity", sym_op="max") + G0 = Graph.from_dense( + M, + directed=False, + weighted=True, + mode="similarity", + sym_op="max", + ) - op = DoublyStochastic(tolerance=1e-6, max_iter=10_000, backbone_method=True) + op = DoublyStochasticBackbone(tolerance=1e-6, max_iter=10_000) G = op.apply(G0) A = G.adj @@ -71,19 +77,17 @@ def test_doubly_stochastic_with_backbone_method(): # No NaNs or infs assert np.isfinite(A.data).all() - # Check that only backbone edges remain (sparser than original) + # Backbone should be no denser than original graph assert A.nnz <= G0.adj.nnz - # Rows/cols should still approximately sum to 1 (on non-isolated nodes) - row_sums = np.asarray(A.sum(axis=1)).ravel() - col_sums = np.asarray(A.sum(axis=0)).ravel() + # For this connected 4-node graph, the backbone should connect all nodes. + assert G.is_connected() - # Only check nodes that still have edges - nonzero_rows = row_sums > 0 - nonzero_cols = col_sums > 0 - - assert np.allclose(row_sums[nonzero_rows], row_sums[nonzero_rows][0], atol=1e-6) - assert np.allclose(col_sums[nonzero_cols], col_sums[nonzero_cols][0], atol=1e-6) + # The backbone implementation adds strongest edges until + # connected, a connected undirected graph should end up with at least n-1 + # and at most the original number of edges. + assert G.n_edges >= G.n_nodes - 1 + assert G.n_edges <= G0.n_edges # Graph properties preserved assert not G.directed and G.weighted @@ -100,7 +104,7 @@ def test_doubly_stochastic_sparse_with_isolates(): ) G0 = Graph.from_csr(A, directed=False, weighted=True, mode="similarity", sym_op="max") - op = DoublyStochastic(tolerance=1e-6, max_iter=10_000) + op = DoublyStochasticNormalize(tolerance=1e-6, max_iter=10_000) G = op.apply(G0) A2 = G.adj @@ -136,7 +140,7 @@ def test_doubly_stochastic_sparse_with_isolates_backbone(): ) G0 = Graph.from_csr(A, directed=False, weighted=True, mode="similarity", sym_op="max") - op = DoublyStochastic(tolerance=1e-6, max_iter=10_000, backbone_method=True) + op = DoublyStochasticBackbone(tolerance=1e-6, max_iter=10_000) G = op.apply(G0) A2 = G.adj @@ -159,6 +163,60 @@ def test_doubly_stochastic_sparse_with_isolates_backbone(): assert len(G_nx.edges(4)) == 0 +def test_doubly_stochastic_backbone_keeps_adding_edges_until_connected(): + # This graph has two very strong edges, 0--1 and 2--3, plus one weak + # bridge, 1--2. + A = _csr( + data=[100.0, 100.0, 100.0, 100.0, 1.0, 1.0], + rows=[0, 1, 2, 3, 1, 2], + cols=[1, 0, 3, 2, 2, 1], + n=4, + ) + G0 = Graph.from_csr(A, directed=False, weighted=True, mode="similarity", sym_op="max") + + op = DoublyStochasticBackbone(tolerance=1e-6, max_iter=10_000) + G = op.apply(G0) + + assert G.is_connected() + assert G.n_edges == 3 + + # The weak bridge is required for connectivity and should not be skipped. + assert G.adj[1, 2] > 0 + assert G.adj[2, 1] > 0 + + +def test_doubly_stochastic_backbone_preserves_node_order_after_networkx_conversion(): + # Node 4 is part of the strongest edge. Without an explicit nodelist in + # nx.to_scipy_sparse_array, NetworkX insertion order can remap this edge + # to different matrix indices. + A = _csr( + data=[10.0, 10.0, 1.0, 1.0, 1.0, 1.0], + rows=[0, 4, 0, 1, 1, 4], + cols=[4, 0, 1, 0, 4, 1], + n=5, + ) + G0 = Graph.from_csr(A, directed=False, weighted=True, mode="similarity", sym_op="max") + + op = DoublyStochasticBackbone(tolerance=1e-6, max_iter=10_000) + G = op.apply(G0) + + assert G.adj.shape == (5, 5) + + # Node labels should still correspond to the original matrix indices. + # In particular, node 4 should still be connected to the component, + # not silently remapped to another row/column. + assert G.adj[0, 4] > 0 or G.adj[1, 4] > 0 + assert np.asarray(G.adj[4].sum()).ravel()[0] > 0 + + # Node 2 and node 3 were isolated in the input and should remain isolated. + row_sums = np.asarray(G.adj.sum(axis=1)).ravel() + col_sums = np.asarray(G.adj.sum(axis=0)).ravel() + + assert row_sums[2] == 0.0 + assert col_sums[2] == 0.0 + assert row_sums[3] == 0.0 + assert col_sums[3] == 0.0 + # ----------------- Directed case: rows and cols ~1 for nonzero rows/cols ----------------- def test_doubly_stochastic_directed_graph_unsolvable(): # Directed 4x4 with zeros on diagonal, not symmetric @@ -171,7 +229,7 @@ def test_doubly_stochastic_directed_graph_unsolvable(): ) G0 = Graph.from_csr(A, directed=True, weighted=True, mode="similarity") - op = DoublyStochastic(tolerance=1e-6, max_iter=10_000) + op = DoublyStochasticNormalize(tolerance=1e-6, max_iter=10_000) G = op.apply(G0) A2 = G.adj @@ -189,7 +247,7 @@ def test_doubly_stochastic_directed_graph_unsolvable(): def test_doubly_stochastic_rejects_negative_weights(): A = _csr([-0.2, 0.5], [0, 1], [1, 0], 2) G0 = Graph.from_csr(A, directed=True, weighted=True, mode="similarity", sym_op="max") - op = DoublyStochastic() + op = DoublyStochasticNormalize() with pytest.raises(ValueError, match="nonnegative"): op.apply(G0) @@ -197,7 +255,7 @@ def test_doubly_stochastic_rejects_negative_weights(): def test_doubly_stochastic_rejects_distances(): A = _csr([0.2, 0.5], [0, 1], [1, 0], 2) G0 = Graph.from_csr(A, directed=True, weighted=True, mode="distance", sym_op="max") - op = DoublyStochastic() + op = DoublyStochasticNormalize() with pytest.raises(ValueError, match="only supports modes"): op.apply(G0) @@ -206,7 +264,7 @@ def test_doubly_stochastic_rejects_distances(): def test_doubly_stochastic_all_zero_matrix_noop(): A = sp.csr_matrix((4, 4), dtype=float) G0 = Graph.from_csr(A, directed=False, weighted=True, mode="similarity", sym_op="max") - op = DoublyStochastic() + op = DoublyStochasticNormalize() G = op.apply(G0) assert G.adj.nnz == 0 assert G.adj.shape == (4, 4) @@ -219,7 +277,7 @@ def test_doubly_stochastic_preserves_flags_and_copies_metadata(): A = _csr([0.4, 0.6, 0.3], [0, 1, 2], [1, 2, 0], 3) G0 = Graph.from_csr(A, directed=False, weighted=True, mode="similarity", meta=meta, sym_op="max") - op = DoublyStochastic(tolerance=1e-6, max_iter=10_000, copy_meta=True) + op = DoublyStochasticNormalize(tolerance=1e-6, max_iter=10_000, copy_meta=True) G = op.apply(G0) # Flags