Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions graphconstructor/operators/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from .base import GraphOperator
from .disparity import DisparityFilter
from .doubly_stochastic import DoublyStochastic
from .doubly_stochastic import DoublyStochasticBackbone, DoublyStochasticNormalize
from .knn_selector import KNNSelector
from .locally_adaptive_sparsification import LocallyAdaptiveSparsification
from .marginal_likelihood import MarginalLikelihoodFilter
Expand All @@ -11,7 +11,8 @@

__all__ = [
"DisparityFilter",
"DoublyStochastic",
"DoublyStochasticNormalize",
"DoublyStochasticBackbone",
"GraphOperator",
"KNNSelector",
"LocallyAdaptiveSparsification",
Expand Down
183 changes: 138 additions & 45 deletions graphconstructor/operators/doubly_stochastic.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
import warnings
from dataclasses import dataclass
import networkx as nx
import numpy as np
import scipy.sparse as sp
from ..graph import Graph
from .base import GraphOperator


# Z. 48-57:
# https://gitlab.liris.cnrs.fr/coregraphie/netbone/-/blob/main/netbone/structural/doubly_stochastic.py?ref_type=heads
@dataclass(slots=True)
class DoublyStochastic(GraphOperator):
class DoublyStochasticNormalize(GraphOperator):
"""
Alternating row/column normalization (Sinkhorn-Knopp) to make the adjacency
approximately doubly stochastic. Works with CSR without densifying.
Expand All @@ -33,7 +33,6 @@ class DoublyStochastic(GraphOperator):
Copy metadata frame if present. Default True.
"""

backbone_method: bool = False
tolerance: float = 1e-5
max_iter: int = 10_000
copy_meta: bool = True
Expand All @@ -44,9 +43,9 @@ def apply(self, G: Graph) -> Graph:
A = G.adj.tocsr(copy=False)

if A.shape[0] != A.shape[1]:
raise TypeError("DoublyStochastic requires a square adjacency.")
raise TypeError("DoublyStochasticNormalize requires a square adjacency.")
if (A.data < 0).any():
raise ValueError("DoublyStochastic requires nonnegative weights.")
raise ValueError("DoublyStochasticNormalize requires nonnegative weights.")

n = A.shape[0]
if A.nnz == 0:
Expand All @@ -67,87 +66,181 @@ def apply(self, G: Graph) -> Graph:
# Precompute nnz masks for convergence checks under sparsity
row_has_edges = np.array(A.sum(axis=1) > 0).T[0]
col_has_edges = np.array(A.sum(axis=0) > 0)[0]
# The csc conversions above could be heavy; build once

# Build transposed CSR once for repeated column-sum products
A_T = A.T.tocsr(copy=False)

min_thres = 1.0 - self.tolerance
max_thres = 1.0 + self.tolerance

# Parameter for stabilitation
# Parameter for stabilization
MAX_FACTOR = 1e50

converged = False
for _ in range(self.max_iter):
c[col_has_edges] = 1 / A_T.dot(r)[col_has_edges]
r[row_has_edges] = 1 / A.dot(c)[row_has_edges]

if np.any(np.abs(r) > MAX_FACTOR) or np.any(np.abs(c) > MAX_FACTOR):
break # avoid overflow; close enough
warnings.warn(
"DoublyStochasticNormalize stopped early because scaling factors "
"became very large. Result may not be doubly stochastic.",
RuntimeWarning,
)
break

# Convergence check (band and tol)
row_sums = r * (A.dot(c))
col_sums = c * (A_T.dot(r))

# Only check rows/cols that have edges (others stay 0 and are irrelevant)
if row_has_edges.any():
rows_ok = np.all((row_sums[row_has_edges] >= min_thres) & (row_sums[row_has_edges] <= max_thres))
rows_ok = np.all(
(row_sums[row_has_edges] >= min_thres)
& (row_sums[row_has_edges] <= max_thres)
)
else:
rows_ok = True

if col_has_edges.any():
cols_ok = np.all((col_sums[col_has_edges] >= min_thres) & (col_sums[col_has_edges] <= max_thres))
cols_ok = np.all(
(col_sums[col_has_edges] >= min_thres)
& (col_sums[col_has_edges] <= max_thres)
)
else:
cols_ok = True

if rows_ok and cols_ok:
converged = True
break

if not converged:
warnings.warn(
"DoublyStochasticNormalize did not converge within max_iter.",
RuntimeWarning,
)

# Apply scaling once: A' = diag(r) * A * diag(c) (CSR-friendly)
A_scaled = A.copy()

# row scaling
A_scaled.data *= np.repeat(r, np.diff(A_scaled.indptr))

# col scaling
A_scaled.data *= c[A_scaled.indices]

# step 2
if self.backbone_method:
i = 0

rows, cols = A_scaled.nonzero()
vals = A_scaled.data

order = np.argsort(vals)[::-1]
rows = rows[order]
cols = cols[order]
vals = vals[order]
if not G.directed:
G_filtered = nx.Graph()
while (
nx.number_connected_components(G_filtered) != 1
or len(G_filtered) < A_scaled.shape[0]
or not nx.is_connected(G_filtered)
):
if i == len(rows) or G_filtered.number_of_nodes() == len(set(rows)):
break
G_filtered.add_edge(rows[i], cols[i], weight=vals[i])
i += 1

# add isolated nodes
G_filtered.add_nodes_from(range(G.n_nodes))
G_csr = nx.to_scipy_sparse_array(G_filtered)
# TODO: For undirected graphs, Graph.from_csr symmetrizes A_scaled again.
# This may change row/column sums after normalization. A future revision should
# either use symmetric scaling or allow Graph construction without re-symmetrizing.
return Graph.from_csr(
A_scaled,
directed=G.directed,
weighted=True,
mode=G.mode,
meta=(G.meta.copy() if (self.copy_meta and G.meta is not None) else G.meta),
sym_op="max",
)

return Graph.from_csr(
G_csr,
directed=G.directed,
weighted=True,
mode=G.mode,
meta=(G.meta.copy() if (self.copy_meta and G.meta is not None) else G.meta),
sym_op="max",

@dataclass(slots=True)
class DoublyStochasticBackbone(GraphOperator):
"""
Backbone extraction based on doubly-stochastic edge scores.

First applies DoublyStochasticNormalize, then sorts edges by normalized score
and adds strongest edges until the graph becomes connected, or until no candidate
edges remain. For disconnected inputs, the result is a strongest-edge forest
over the available components.

References
----------
- Coscia, M. (2025). "The Atlas for the Inspiring Network Scientist."
- Yassin, A. (2023). "An evaluation tool for backbone extraction techniques
in weighted complex networks." Scientific Reports.

Parameters
----------
tolerance : float
Band tolerance passed to DoublyStochasticNormalize. Default 1e-5.
max_iter : int
Maximum Sinkhorn iterations passed to DoublyStochasticNormalize.
Default 10_000.
copy_meta : bool
Copy metadata frame if present. Default True.
"""

tolerance: float = 1e-5
max_iter: int = 10_000
copy_meta: bool = True
supported_modes = ["similarity"]

def apply(self, G: Graph) -> Graph:
self._check_mode_supported(G)

if G.directed:
raise NotImplementedError(
"DoublyStochasticBackbone currently supports only undirected graphs."
)

normalized = DoublyStochasticNormalize(
tolerance=self.tolerance,
max_iter=self.max_iter,
copy_meta=self.copy_meta,
).apply(G)

A_scaled = normalized.adj.tocsr(copy=False)
G_filtered = self._extract_undirected_backbone(A_scaled, G.n_nodes)

G_csr = nx.to_scipy_sparse_array(
G_filtered,
nodelist=list(range(G.n_nodes)),
weight="weight",
format="csr",
)

return Graph.from_csr(
A_scaled,
G_csr,
directed=G.directed,
weighted=True,
mode=G.mode,
meta=(G.meta.copy() if (self.copy_meta and G.meta is not None) else G.meta),
sym_op="max",
)

@staticmethod
def _extract_undirected_backbone(A_scaled, n_nodes: int) -> nx.Graph:
"""
Extract an undirected backbone from normalized edge scores.

Uses only the upper triangle of the adjacency matrix so that each
undirected edge is considered once.
"""
G_filtered = nx.Graph()
G_filtered.add_nodes_from(range(n_nodes))

if n_nodes <= 1 or A_scaled.nnz == 0:
return G_filtered

# Use only one orientation per undirected edge and ignore self-loops.
A_upper = sp.triu(A_scaled, k=1).tocoo()

if A_upper.nnz == 0:
return G_filtered

rows = A_upper.row
cols = A_upper.col
vals = A_upper.data

order = np.argsort(vals)[::-1]

for idx in order:
if nx.is_connected(G_filtered): # TODO: For large graphs, this check could be expensive
break

G_filtered.add_edge(
int(rows[idx]),
int(cols[idx]),
weight=float(vals[idx]),
)

return G_filtered
Loading
Loading