# Supplementary table 3: Leiden clustering, searching for optimal resolution 

In [35]:
import sys
import os
import matplotlib.pyplot as plt 

from collections import Counter

# Third-party library imports
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit
from glob import glob
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

# Local module imports
os.chdir('C:/Users/Adminn/Documents/GitHub/CEG/src')
from Graph_builder import *  # Import graph-building utilities
from CellECMGraphs_multiple import *  # Import Cell-ECM graph utilities
from Helper_functions import *  # Import helper functions
from SimData_Generator import *

# Set random seeds for reproducibility
random.seed(42)
np.random.seed(42)


import torch 
from tqdm import tqdm 

In [36]:
# Load cell-ECM graph with ground truth regions 
ceg_gt = load_ceg("C:/Users/Adminn/Desktop/PhD/cell_ECM_graphs/Benchmarking/benchmark_data/benchmark_dataset.dill").ceg_dict[0]
ceg_gt.single = True # Auto visualization 

ground_truth = LabelEncoder().fit_transform(np.array([gt for n, gt in ceg_gt.G.nodes(data='ground_truth_label') if 'cell' in n]))


Cell graph 

In [37]:
# Leiden clustering on cell-ECM graph

import igraph as ig
import leidenalg

cell_r_results = {'nmi': [], 'ari': []}

for resolution in [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0]:
    # Extract adjacency matrix for cell nodes only
    cell_nodes = [n for n in ceg_gt.cell_G.nodes if 'cell' in n]
    cell_indices = {n: i for i, n in enumerate(cell_nodes)}

    # Build igraph from networkx subgraph of cell nodes
    cell_subgraph = ceg_gt.G.subgraph(cell_nodes)
    edges = [(cell_indices[u], cell_indices[v]) for u, v in cell_subgraph.edges()]
    g_ig = ig.Graph(edges=edges, directed=False)

    # Run Leiden clustering (set number of clusters or use default resolution)
    leiden_partition = leidenalg.find_partition(g_ig, leidenalg.RBConfigurationVertexPartition, resolution_parameter=resolution)
    leiden_labels = np.array(leiden_partition.membership)

    # Assign Leiden cluster labels back to nodes
    for n, label in zip(cell_nodes, leiden_labels):
        ceg_gt.G.nodes[n]['cell_leiden_label'] = label

    cell_predicted_leiden = LabelEncoder().fit_transform(np.array([ceg_gt.G.nodes[n]['cell_leiden_label'] for n in ceg_gt.G.nodes if 'cell' in n]))

    ari_cell_leiden = adjusted_rand_score(ground_truth, cell_predicted_leiden)
    nmi_cell_leiden = normalized_mutual_info_score(ground_truth, cell_predicted_leiden)
    cell_r_results['nmi'].append(nmi_cell_leiden)
    cell_r_results['ari'].append(ari_cell_leiden)

    

In [38]:

cell_r_results # 0.001 optimal resolution for leiden clustering on cell graph 



{'nmi': [0.33162094348828497,
  0.23360127614675882,
  0.18210306238319973,
  0.169027624893593,
  0.15323353933379302,
  0.13701033471532004],
 'ari': [0.3026363963116996,
  0.04623067491232834,
  0.03394721202345103,
  0.01274031823212024,
  0.004858777839169428,
  0.001754970605490017]}

cell-ECM graph

In [39]:

cell_ecm_r_results = {'nmi': [], 'ari': []}

for resolution in [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0]:
    # Use all nodes (cell and ECM)
    all_nodes = list(ceg_gt.G.nodes)
    node_indices = {n: i for i, n in enumerate(all_nodes)}

    # Build igraph from networkx subgraph of all nodes
    edges = [(node_indices[u], node_indices[v]) for u, v in ceg_gt.G.edges()]
    g_ig = ig.Graph(edges=edges, directed=False)

    # Run Leiden clustering (set resolution as needed)
    leiden_partition = leidenalg.find_partition(g_ig, leidenalg.RBConfigurationVertexPartition, resolution_parameter=resolution)
    leiden_labels = np.array(leiden_partition.membership)

    # Assign Leiden cluster labels back to nodes
    for n, label in zip(all_nodes, leiden_labels):
        ceg_gt.G.nodes[n]['cell_ecm_leiden_label'] = label
    

    cell_ecm_predicted_leiden = LabelEncoder().fit_transform(np.array([ceg_gt.G.nodes[n]['cell_ecm_leiden_label'] for n in ceg_gt.G.nodes if 'cell' in n]))
    ari_cell_ecm_leiden = adjusted_rand_score(ground_truth, cell_ecm_predicted_leiden)
    nmi_cell_ecm_leiden = normalized_mutual_info_score(ground_truth, cell_ecm_predicted_leiden)
    cell_ecm_r_results['nmi'].append(nmi_cell_ecm_leiden)
    cell_ecm_r_results['ari'].append(ari_cell_ecm_leiden)

In [40]:
# Combined cell_r_results and cell_ecm_r_results into single df 
resolution_opti_results = pd.DataFrame({
    'resolution': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0],
    'cell_nmi': cell_r_results['nmi'],
    'cell_ari': cell_r_results['ari'],
    'cell_ecm_nmi': cell_ecm_r_results['nmi'],
    'cell_ecm_ari': cell_ecm_r_results['ari']
})


In [41]:
resolution_opti_results

Unnamed: 0,resolution,cell_nmi,cell_ari,cell_ecm_nmi,cell_ecm_ari
0,0.0001,0.331621,0.302636,0.382726,0.310716
1,0.001,0.233601,0.046231,0.382726,0.310716
2,0.01,0.182103,0.033947,0.173577,0.010366
3,0.1,0.169028,0.01274,0.167543,0.014506
4,1.0,0.153234,0.004859,0.152077,0.005408
5,10.0,0.13701,0.001755,0.141545,0.002134


In [None]:

resolution_opti_results.to_csv('D:/Supplementary_figures/leiden_resolution_optimization.csv')