# Supplementary Table 4: Searching for the optimal LisaClust radius for benchmarking

In [2]:
import sys
import os
import matplotlib.pyplot as plt 

from collections import Counter

# Third-party library imports
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit
from glob import glob
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

# Local module imports
# The working directory is switched to the project's src/ folder first, presumably so
# that the project modules below can be imported by bare name.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not run
# elsewhere without editing it.
os.chdir('C:/Users/Adminn/Documents/GitHub/CEG/src')
# NOTE(review): `load_ceg` and `LabelEncoder` are used in later cells without any explicit
# import in this notebook, so they are presumably exposed by one of the wildcard imports
# below -- TODO confirm which module provides them.
from Graph_builder import *  # Import graph-building utilities
from CellECMGraphs_multiple import *  # Import Cell-ECM graph utilities
from Helper_functions import *  # Import helper functions
from SimData_Generator import *

# Set random seeds for reproducibility
# Import `random` explicitly instead of relying on a wildcard import to expose the name;
# otherwise this cell raises NameError as soon as no star-imported module re-exports it.
import random

SEED = 42  # single seed shared by the stdlib and NumPy global generators
random.seed(SEED)
np.random.seed(SEED)


import torch 
from tqdm import tqdm 

In [3]:
# Load the benchmark cell-ECM graph collection and keep entry 0 of its `ceg_dict`;
# the nodes of that graph carry a 'ground_truth_label' attribute.
ceg_gt = load_ceg(
    "C:/Users/Adminn/Desktop/PhD/cell_ECM_graphs/Benchmarking/benchmark_data/benchmark_dataset.dill"
).ceg_dict[0]
ceg_gt.single = True  # Auto visualization

# Collect the ground-truth label of every node whose name contains 'cell' (in graph
# iteration order), then encode the labels as integers.
cell_truth_labels = np.array([
    label
    for node_id, label in ceg_gt.G.nodes(data='ground_truth_label')
    if 'cell' in node_id
])
ground_truth = LabelEncoder().fit_transform(cell_truth_labels)


In [4]:
# Load the region predictions produced by LisaClust in R: every file matching the
# pattern below is read into one DataFrame.

# glob() returns matches in arbitrary, filesystem-dependent order. The files are paired
# with `Radii` purely by position, so sort the paths to make that pairing deterministic.
LisaClust_Radi_Results_path = sorted(glob('C:/Users/Adminn/Documents/lisaClust_dfs/*csv*'))
LisaClust_Radi_Results = [pd.read_csv(csv_path) for csv_path in LisaClust_Radi_Results_path]

# The radii are listed in the lexicographic order of their string form
# ("10" < "100" < "20" < ... < "5" < "50" < "60"), i.e. the order in which sorted file
# names are presumed to appear.
# NOTE(review): confirm that every file name embeds its radius such that position i of
# the sorted paths really corresponds to Radii[i].
Radii = [10, 100, 20, 200, 30, 300, 40, 5, 50, 60]

In [5]:
# Store each radius' LisaClust region assignment on the cell nodes of the ground-truth
# graph, under the node attribute 'LisaClust_<radius>'.
# Rows of each result table are matched to cell nodes purely by position (row i goes to
# the i-th node whose name contains 'cell', in graph iteration order).
# NOTE(review): this assumes the CSV rows were exported in that same order -- confirm.

# zip() would silently drop unmatched radii/tables, so fail loudly on a count mismatch.
if len(Radii) != len(LisaClust_Radi_Results):
    raise ValueError(
        f"Expected one LisaClust result table per radius: got {len(LisaClust_Radi_Results)} "
        f"tables for {len(Radii)} radii."
    )

# The set of cell nodes does not depend on the radius: collect it once.
cell_nodes = [n for n in ceg_gt.G.nodes if 'cell' in n]

for r, r_df in zip(Radii, LisaClust_Radi_Results):
    # Pull the column once instead of materialising a full row per cell via .iloc.
    regions = r_df['region'].to_numpy()

    # A row-count mismatch means the positional pairing cannot be right.
    if len(regions) != len(cell_nodes):
        raise ValueError(
            f"Radius {r}: result table has {len(regions)} rows but the graph has "
            f"{len(cell_nodes)} cell nodes."
        )

    attr_name = 'LisaClust_' + str(r)
    for node, region in zip(cell_nodes, regions):
        ceg_gt.G.nodes[node][attr_name] = region


In [6]:
# Score every radius against the ground truth. The result maps each radius to a dict
# holding its adjusted Rand index ('ARI') and normalized mutual information ('NMI').
Radii_results = {}

for radius in Radii:
    attr_key = 'LisaClust_' + str(radius)

    # Region stored on each cell node for this radius, in graph iteration order
    # (the same order used to build `ground_truth`), encoded as integer labels.
    cell_regions = [ceg_gt.G.nodes[node][attr_key] for node in ceg_gt.G.nodes if 'cell' in node]
    predicted_regions = LabelEncoder().fit_transform(np.array(cell_regions))

    Radii_results[radius] = {
        'ARI': adjusted_rand_score(ground_truth, predicted_regions),
        'NMI': normalized_mutual_info_score(ground_truth, predicted_regions),
    }



In [8]:
# Assemble the per-radius scores into a table (one row per radius, sorted by radius,
# rounded to 4 decimals) and export it as CSV.
Radii_results_df = (
    pd.DataFrame(Radii_results)
    .T[['ARI', 'NMI']]  # select by name: never relabels a column, fails loudly if one is missing
    .sort_index()
    .round(4)
)

# Label the index column in the file; without index_label the CSV header starts with an
# empty field, which pandas reads back as 'Unnamed: 0'.
# NOTE(review): absolute, machine-specific output path; the target folder must already exist.
Radii_results_df.to_csv(
    'D:/Supplementary_figures/LisaClust_Radii_optimization.csv',
    index_label='Radius',
)

In [9]:
# Display the rounded ARI/NMI table (one row per radius, sorted by radius)
Radii_results_df

Unnamed: 0,ARI,NMI
5,0.1241,0.2539
10,0.2008,0.3694
20,0.1688,0.3392
30,0.0923,0.2255
40,0.1668,0.3362
50,0.1273,0.2978
60,0.1118,0.2775
100,0.091,0.2346
200,0.0819,0.1377
300,0.0011,0.1103
