### 0. import packages and select GPU if accessible

In [None]:
import os,csv,re
import pandas as pd
import numpy as np
import scanpy as sc
import math
import SpaGCN as spg
from scipy.sparse import issparse
import random, torch
import warnings
warnings.filterwarnings("ignore")
import matplotlib.colors as clr
import matplotlib.pyplot as plt
import SpaGCN as spg
# Reading the histology image requires an image library; here we recommend the "opencv" package.
# Install opencv for Python:
#!pip3 install opencv-python
import cv2
from sklearn.metrics import adjusted_rand_score
from st_loading_utils import load_DLPFC, load_BC, load_mVC, load_mPFC, load_mHypothalamus, load_her2_tumor, load_mMAMP

# Compute device: the first CUDA GPU when PyTorch reports one as available,
# otherwise the CPU. Using a GPU is recommended.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')


In [None]:
# Number of training runs performed per slice by the dataset cells below
# (each run appends one ARI value). Kept at 1 for script testing.
iters = 1 # for script testing

### 1. DLPFC dataset (12 slides)

Change `dir_` in the cell below to 'path/to/your/DLPFC/data'.

In [None]:
"""DLPFC"""
# Each entry pairs the requested number of clusters with a DLPFC section id.
setting_combinations = [
    [7, '151507'], [7, '151508'], [7, '151509'], [7, '151510'],
    [5, '151669'], [5, '151670'], [5, '151671'], [5, '151672'],
    [7, '151673'], [7, '151674'], [7, '151675'], [7, '151676'],
]
for n_clusters, dataset in setting_combinations:
    save_path = '../results/' + dataset + '/'

    dir_ = './benchmarking_data/DLPFC12'
    adata = load_DLPFC(root_dir=dir_, section_id=dataset)
    aris = []  # one ARI value per training run of this section

    # cv2.imread reports a missing/unreadable file by returning None instead of
    # raising, so the result is checked explicitly; only OpenCV errors are caught.
    try:
        img = cv2.imread(os.path.join(dir_, dataset, dataset + '_full_image.tif'))
    except cv2.error:
        img = None
    use_histology = img is not None
    if not use_histology:
        # Without an image, histology=True would fail inside SpaGCN; fall back
        # to a coordinate-only adjacency and say so.
        print('Histology image unavailable for', dataset, '- building adjacency from coordinates only')

    s = 1   # passed as alpha to calculate_adj_matrix
    b = 49  # passed as beta to calculate_adj_matrix
    x_array = adata.obs["array_row"].tolist()
    y_array = adata.obs["array_col"].tolist()
    x_pixel = adata.obsm["spatial"][:, 0].tolist()
    y_pixel = adata.obsm["spatial"][:, 1].tolist()
    adj = spg.calculate_adj_matrix(x=x_pixel, y=y_pixel, x_pixel=x_pixel, y_pixel=y_pixel,
                                   image=img, beta=b, alpha=s, histology=use_histology)

    spg.prefilter_genes(adata, min_cells=3)  # avoid genes that are all zeros
    spg.prefilter_specialgenes(adata)
    # Normalize and take log for UMI
    sc.pp.normalize_per_cell(adata)
    sc.pp.log1p(adata)

    p = 0.5
    # Find the l value given p
    l = spg.search_l(p, adj, start=0.01, end=1000, tol=0.01, max_run=100)
    # Set seed
    r_seed = t_seed = n_seed = 100
    # Search for a suitable resolution
    res = spg.search_res(adata, adj, l, n_clusters, start=0.7, step=0.1, tol=5e-3, lr=0.05,
                         max_epochs=20, r_seed=r_seed, t_seed=t_seed, n_seed=n_seed)

    # The coordinate-only adjacency used for refinement depends only on the
    # array coordinates, so it is computed once instead of once per run.
    adj_2d = spg.calculate_adj_matrix(x=x_array, y=y_array, histology=False)

    for _ in range(iters):
        clf = spg.SpaGCN()
        clf.set_l(l)
        # Re-seed every run
        random.seed(r_seed)
        torch.manual_seed(t_seed)
        np.random.seed(n_seed)
        # Run
        clf.train(adata, adj, init_spa=True, init="louvain", res=res, tol=5e-3, lr=0.05, max_epochs=200)
        y_pred, prob = clf.predict()
        adata.obs["pred"] = y_pred
        adata.obs["pred"] = adata.obs["pred"].astype('category')
        # Do cluster refinement (optional)
        # shape="hexagon" for Visium data, "square" for ST data.
        refined_pred = spg.refine(sample_id=adata.obs.index.tolist(), pred=adata.obs["pred"].tolist(),
                                  dis=adj_2d, shape="hexagon")
        adata.obs["refined_pred"] = refined_pred
        adata.obs["refined_pred"] = adata.obs["refined_pred"].astype('category')
        ARI = adjusted_rand_score(adata.obs["refined_pred"], adata.obs["original_clusters"])
        aris.append(ARI)
        print('Dataset:', dataset)
        print(ARI)

    print('Dataset:', dataset)
    print(aris)
    print(np.mean(aris))
    # Append one line per section: label followed by the space-separated ARIs.
    with open('spagcn_aris.txt', 'a+') as fp:
        fp.write('DLPFC' + dataset + ' ' + ' '.join(str(i) for i in aris) + '\n')

### 2. BC/MA datasets (2 slides)

In [None]:
"""BC"""
# Each entry pairs the requested number of clusters with a section id.
setting_combinations = [[20, 'section1']]
for n_clusters, dataset in setting_combinations:
    dir_ = './benchmarking_data/BC'
    adata = load_BC(root_dir=dir_, section_id=dataset)

    aris = []  # one ARI value per training run of this section

    # cv2.imread reports a missing/unreadable file by returning None instead of
    # raising, so the result is checked explicitly; only OpenCV errors are caught.
    try:
        img = cv2.imread(os.path.join(dir_, dataset, dataset + '_full_image.tif'))
    except cv2.error:
        img = None
    use_histology = img is not None
    if not use_histology:
        # Without an image, histology=True would fail inside SpaGCN; fall back
        # to a coordinate-only adjacency and say so.
        print('Histology image unavailable for', dataset, '- building adjacency from coordinates only')

    s = 1   # passed as alpha to calculate_adj_matrix
    b = 49  # passed as beta to calculate_adj_matrix
    x_array = adata.obs["array_row"].tolist()
    y_array = adata.obs["array_col"].tolist()
    x_pixel = adata.obsm["spatial"][:, 0].tolist()
    y_pixel = adata.obsm["spatial"][:, 1].tolist()
    adj = spg.calculate_adj_matrix(x=x_pixel, y=y_pixel, x_pixel=x_pixel, y_pixel=y_pixel,
                                   image=img, beta=b, alpha=s, histology=use_histology)

    spg.prefilter_genes(adata, min_cells=3)  # avoid genes that are all zeros
    spg.prefilter_specialgenes(adata)
    # Normalize and take log for UMI
    sc.pp.normalize_per_cell(adata)
    sc.pp.log1p(adata)

    p = 0.5
    # Find the l value given p
    l = spg.search_l(p, adj, start=0.01, end=1000, tol=0.01, max_run=100)
    # Set seed
    r_seed = t_seed = n_seed = 100
    # Search for a suitable resolution
    res = spg.search_res(adata, adj, l, n_clusters, start=0.7, step=0.1, tol=5e-3, lr=0.05,
                         max_epochs=20, r_seed=r_seed, t_seed=t_seed, n_seed=n_seed)

    # The coordinate-only adjacency used for refinement depends only on the
    # array coordinates, so it is computed once instead of once per run.
    adj_2d = spg.calculate_adj_matrix(x=x_array, y=y_array, histology=False)

    for _ in range(iters):
        clf = spg.SpaGCN()
        clf.set_l(l)
        # Re-seed every run
        random.seed(r_seed)
        torch.manual_seed(t_seed)
        np.random.seed(n_seed)
        # Run
        clf.train(adata, adj, init_spa=True, init="louvain", res=res, tol=5e-3, lr=0.05, max_epochs=200)
        y_pred, prob = clf.predict()
        adata.obs["pred"] = y_pred
        adata.obs["pred"] = adata.obs["pred"].astype('category')
        # Do cluster refinement (optional)
        # shape="hexagon" for Visium data, "square" for ST data.
        refined_pred = spg.refine(sample_id=adata.obs.index.tolist(), pred=adata.obs["pred"].tolist(),
                                  dis=adj_2d, shape="hexagon")
        adata.obs["refined_pred"] = refined_pred
        adata.obs["refined_pred"] = adata.obs["refined_pred"].astype('category')
        ARI = adjusted_rand_score(adata.obs["refined_pred"], adata.obs["original_clusters"])
        aris.append(ARI)
        print('Dataset:', dataset)
        print(ARI)

    print('Dataset:', dataset)
    print(aris)
    print(np.mean(aris))
    # Append one line: label followed by the space-separated ARIs.
    with open('spagcn_aris.txt', 'a+') as fp:
        fp.write('HBRC1 ' + ' '.join(str(i) for i in aris) + '\n')

In [None]:
"""load MA section"""
# Each entry pairs the requested number of clusters with a section id.
setting_combinations = [[52, 'MA']]
for n_clusters, dataset in setting_combinations:
    dir_ = './benchmarking_data/mMAMP'
    adata = load_mMAMP(root_dir=dir_, section_id=dataset)

    aris = []  # one ARI value per training run of this section

    # cv2.imread reports a missing/unreadable file by returning None instead of
    # raising, so the result is checked explicitly; only OpenCV errors are caught.
    try:
        img = cv2.imread(os.path.join(dir_, dataset, dataset + '_full_image.tif'))
    except cv2.error:
        img = None

    s = 1   # passed as alpha to calculate_adj_matrix
    b = 49  # passed as beta to calculate_adj_matrix
    x_array = adata.obs["array_row"].tolist()
    y_array = adata.obs["array_col"].tolist()
    x_pixel = adata.obsm["spatial"][:, 0].tolist()
    y_pixel = adata.obsm["spatial"][:, 1].tolist()
    # Use the histology image only when one was actually loaded.
    adj = spg.calculate_adj_matrix(x=x_pixel, y=y_pixel, x_pixel=x_pixel, y_pixel=y_pixel,
                                   image=img, beta=b, alpha=s, histology=img is not None)

    spg.prefilter_genes(adata, min_cells=3)  # avoid genes that are all zeros
    spg.prefilter_specialgenes(adata)
    # Normalize and take log for UMI
    sc.pp.normalize_per_cell(adata)
    sc.pp.log1p(adata)

    p = 0.5
    # Find the l value given p
    l = spg.search_l(p, adj, start=0.01, end=1000, tol=0.01, max_run=100)
    # Set seed
    r_seed = t_seed = n_seed = 100
    # Search for a suitable resolution
    res = spg.search_res(adata, adj, l, n_clusters, start=0.7, step=0.1, tol=5e-3, lr=0.05,
                         max_epochs=20, r_seed=r_seed, t_seed=t_seed, n_seed=n_seed)

    # The coordinate-only adjacency used for refinement depends only on the
    # array coordinates, so it is computed once instead of once per run.
    adj_2d = spg.calculate_adj_matrix(x=x_array, y=y_array, histology=False)

    for _ in range(iters):
        clf = spg.SpaGCN()
        clf.set_l(l)
        # Re-seed every run
        random.seed(r_seed)
        torch.manual_seed(t_seed)
        np.random.seed(n_seed)
        # Run
        clf.train(adata, adj, init_spa=True, init="louvain", res=res, tol=5e-3, lr=0.05, max_epochs=200)
        y_pred, prob = clf.predict()
        adata.obs["pred"] = y_pred
        adata.obs["pred"] = adata.obs["pred"].astype('category')
        # Do cluster refinement (optional)
        # shape="hexagon" for Visium data, "square" for ST data.
        refined_pred = spg.refine(sample_id=adata.obs.index.tolist(), pred=adata.obs["pred"].tolist(),
                                  dis=adj_2d, shape="hexagon")
        adata.obs["refined_pred"] = refined_pred
        adata.obs["refined_pred"] = adata.obs["refined_pred"].astype('category')
        ARI = adjusted_rand_score(adata.obs["refined_pred"], adata.obs["original_clusters"])
        aris.append(ARI)
        print('Dataset:', dataset)
        print(ARI)

    print('Dataset:', dataset)
    print(aris)
    print(np.mean(aris))
    # Append one line: label followed by the space-separated ARIs.
    with open('spagcn_aris.txt', 'a+') as fp:
        fp.write('mABC ' + ' '.join(str(i) for i in aris) + '\n')

### 3. mVC/mPFC datasets (4 slides)

In [None]:
"""mVC"""
# Each entry pairs the requested number of clusters with a section id.
setting_combinations = [[7, 'STARmap_20180505_BY3_1k.h5ad']]
for n_clusters, dataset in setting_combinations:
    dir_ = './benchmarking_data/STARmap_mouse_visual_cortex'
    adata = load_mVC(root_dir=dir_, section_id=dataset)

    aris = []  # one ARI value per training run of this section

    # cv2.imread reports a missing/unreadable file by returning None instead of
    # raising, so the result is checked explicitly; only OpenCV errors are caught.
    try:
        img = cv2.imread(os.path.join(dir_, dataset, dataset + '_full_image.tif'))
    except cv2.error:
        img = None

    s = 1   # passed as alpha to calculate_adj_matrix
    b = 49  # passed as beta to calculate_adj_matrix
    x_array = adata.obs["X"].tolist()
    y_array = adata.obs["Y"].tolist()
    x_pixel = adata.obs["X"].tolist()
    y_pixel = adata.obs["Y"].tolist()
    # Identity test, not `img == None`: on a loaded image `==` compares
    # elementwise and the resulting array cannot be used as an `if` condition.
    adj = spg.calculate_adj_matrix(x=x_pixel, y=y_pixel, x_pixel=x_pixel, y_pixel=y_pixel,
                                   image=img, beta=b, alpha=s, histology=img is not None)

    spg.prefilter_genes(adata, min_cells=3)  # avoid genes that are all zeros
    spg.prefilter_specialgenes(adata)
    # Normalize and take log for UMI
    sc.pp.normalize_per_cell(adata)
    sc.pp.log1p(adata)

    p = 0.5
    # Find the l value given p
    l = spg.search_l(p, adj, start=0.01, end=1000, tol=0.01, max_run=100)
    # Set seed
    r_seed = t_seed = n_seed = 100
    # Search for a suitable resolution
    res = spg.search_res(adata, adj, l, n_clusters, start=0.7, step=0.1, tol=5e-3, lr=0.05,
                         max_epochs=20, r_seed=r_seed, t_seed=t_seed, n_seed=n_seed)

    # The coordinate-only adjacency used for refinement depends only on the
    # coordinates, so it is computed once instead of once per run.
    adj_2d = spg.calculate_adj_matrix(x=x_array, y=y_array, histology=False)

    for _ in range(iters):
        clf = spg.SpaGCN()
        clf.set_l(l)
        # Re-seed every run
        random.seed(r_seed)
        torch.manual_seed(t_seed)
        np.random.seed(n_seed)
        # Run
        clf.train(adata, adj, init_spa=True, init="louvain", res=res, tol=5e-3, lr=0.05, max_epochs=200)
        y_pred, prob = clf.predict()
        adata.obs["pred"] = y_pred
        adata.obs["pred"] = adata.obs["pred"].astype('category')
        # Do cluster refinement (optional)
        # shape="hexagon" for Visium data, "square" for ST data.
        # NOTE(review): confirm that "hexagon" is the intended shape for this dataset.
        refined_pred = spg.refine(sample_id=adata.obs.index.tolist(), pred=adata.obs["pred"].tolist(),
                                  dis=adj_2d, shape="hexagon")
        adata.obs["refined_pred"] = refined_pred
        adata.obs["refined_pred"] = adata.obs["refined_pred"].astype('category')
        ARI = adjusted_rand_score(adata.obs["refined_pred"], adata.obs["original_clusters"])
        aris.append(ARI)
        print('Dataset:', dataset)
        print(ARI)

    print('Dataset:', dataset)
    print(aris)
    print(np.mean(aris))
    # Append one line: label followed by the space-separated ARIs.
    with open('spagcn_aris.txt', 'a+') as fp:
        fp.write('mVC ' + ' '.join(str(i) for i in aris) + '\n')

In [None]:
"""mPFC"""
# Each entry pairs the requested number of clusters with a section id.
setting_combinations = [
    [4, '20180417_BZ5_control'],
    [4, '20180419_BZ9_control'],
    [4, '20180424_BZ14_control'],
]
for n_clusters, dataset in setting_combinations:
    dir_ = './benchmarking_data/STARmap_mouse_PFC'
    adata = load_mPFC(root_dir=dir_, section_id=dataset)

    aris = []  # one ARI value per training run of this section

    # cv2.imread reports a missing/unreadable file by returning None instead of
    # raising, so the result is checked explicitly; only OpenCV errors are caught.
    try:
        img = cv2.imread(os.path.join(dir_, dataset, dataset + '_full_image.tif'))
    except cv2.error:
        img = None

    s = 1   # passed as alpha to calculate_adj_matrix
    b = 49  # passed as beta to calculate_adj_matrix
    x_array = adata.obs["x"].tolist()
    y_array = adata.obs["y"].tolist()
    # The same coordinates serve as both array and pixel positions here.
    x_pixel = x_array
    y_pixel = y_array
    # Identity test, not `img == None`: on a loaded image `==` compares
    # elementwise and the resulting array cannot be used as an `if` condition.
    adj = spg.calculate_adj_matrix(x=x_pixel, y=y_pixel, x_pixel=x_pixel, y_pixel=y_pixel,
                                   image=img, beta=b, alpha=s, histology=img is not None)

    spg.prefilter_genes(adata, min_cells=3)  # avoid genes that are all zeros
    spg.prefilter_specialgenes(adata)
    # Normalize and take log for UMI
    sc.pp.normalize_per_cell(adata)
    sc.pp.log1p(adata)

    p = 0.5
    # Find the l value given p
    l = spg.search_l(p, adj, start=0.01, end=1000, tol=0.01, max_run=100)
    # Set seed
    r_seed = t_seed = n_seed = 100
    # Search for a suitable resolution
    res = spg.search_res(adata, adj, l, n_clusters, start=0.7, step=0.1, tol=5e-3, lr=0.05,
                         max_epochs=20, r_seed=r_seed, t_seed=t_seed, n_seed=n_seed)

    # The coordinate-only adjacency used for refinement depends only on the
    # coordinates, so it is computed once instead of once per run.
    adj_2d = spg.calculate_adj_matrix(x=x_array, y=y_array, histology=False)

    for _ in range(iters):
        clf = spg.SpaGCN()
        clf.set_l(l)
        # Re-seed every run
        random.seed(r_seed)
        torch.manual_seed(t_seed)
        np.random.seed(n_seed)
        # Run
        clf.train(adata, adj, init_spa=True, init="louvain", res=res, tol=5e-3, lr=0.05, max_epochs=200)
        y_pred, prob = clf.predict()
        adata.obs["pred"] = y_pred
        adata.obs["pred"] = adata.obs["pred"].astype('category')
        # Do cluster refinement (optional)
        # shape="hexagon" for Visium data, "square" for ST data.
        # NOTE(review): confirm that "hexagon" is the intended shape for this dataset.
        refined_pred = spg.refine(sample_id=adata.obs.index.tolist(), pred=adata.obs["pred"].tolist(),
                                  dis=adj_2d, shape="hexagon")
        adata.obs["refined_pred"] = refined_pred
        adata.obs["refined_pred"] = adata.obs["refined_pred"].astype('category')
        ARI = adjusted_rand_score(adata.obs["refined_pred"], adata.obs["original_clusters"])
        aris.append(ARI)
        print('Dataset:', dataset)
        print(ARI)

    print('Dataset:', dataset)
    print(aris)
    print(np.mean(aris))
    # Append one line per section: label followed by the space-separated ARIs.
    with open('spagcn_aris.txt', 'a+') as fp:
        fp.write('mPFC' + dataset + ' ' + ' '.join(str(i) for i in aris) + '\n')

### 4. mHypothalamus dataset (6 slides)

In [None]:
"""mHypo"""
# Each entry pairs the requested number of clusters with a section id.
setting_combinations = [
    [8, '-0.04'], [8, '-0.09'], [8, '-0.14'],
    [8, '-0.19'], [8, '-0.24'], [8, '-0.29'],
]
for n_clusters, dataset in setting_combinations:
    dir_ = './benchmarking_data/mHypothalamus'
    adata = load_mHypothalamus(root_dir=dir_, section_id=dataset)

    aris = []  # one ARI value per training run of this section

    # cv2.imread reports a missing/unreadable file by returning None instead of
    # raising, so the result is checked explicitly; only OpenCV errors are caught.
    try:
        img = cv2.imread(os.path.join(dir_, dataset, dataset + '_full_image.tif'))
    except cv2.error:
        img = None

    s = 1   # passed as alpha to calculate_adj_matrix
    b = 49  # passed as beta to calculate_adj_matrix
    print(adata.obs)
    x_array = adata.obs["x"].tolist()
    y_array = adata.obs["y"].tolist()
    # The same coordinates serve as both array and pixel positions here.
    x_pixel = x_array
    y_pixel = y_array
    # Identity test, not `img == None`: on a loaded image `==` compares
    # elementwise and the resulting array cannot be used as an `if` condition.
    adj = spg.calculate_adj_matrix(x=x_pixel, y=y_pixel, x_pixel=x_pixel, y_pixel=y_pixel,
                                   image=img, beta=b, alpha=s, histology=img is not None)

    spg.prefilter_genes(adata, min_cells=3)  # avoid genes that are all zeros
    spg.prefilter_specialgenes(adata)
    # Normalize and take log for UMI
    sc.pp.normalize_per_cell(adata)
    sc.pp.log1p(adata)

    p = 0.5
    # Find the l value given p
    l = spg.search_l(p, adj, start=0.01, end=1000, tol=0.01, max_run=100)
    # Set seed
    r_seed = t_seed = n_seed = 100
    # Search for a suitable resolution
    res = spg.search_res(adata, adj, l, n_clusters, start=0.7, step=0.1, tol=5e-3, lr=0.05,
                         max_epochs=20, r_seed=r_seed, t_seed=t_seed, n_seed=n_seed)

    # The coordinate-only adjacency used for refinement depends only on the
    # coordinates, so it is computed once instead of once per run.
    adj_2d = spg.calculate_adj_matrix(x=x_array, y=y_array, histology=False)

    for _ in range(iters):
        clf = spg.SpaGCN()
        clf.set_l(l)
        # Re-seed every run
        random.seed(r_seed)
        torch.manual_seed(t_seed)
        np.random.seed(n_seed)
        # Run
        clf.train(adata, adj, init_spa=True, init="louvain", res=res, tol=5e-3, lr=0.05, max_epochs=200)
        y_pred, prob = clf.predict()
        adata.obs["pred"] = y_pred
        adata.obs["pred"] = adata.obs["pred"].astype('category')
        # Do cluster refinement (optional)
        # shape="hexagon" for Visium data, "square" for ST data.
        # NOTE(review): confirm that "hexagon" is the intended shape for this dataset.
        refined_pred = spg.refine(sample_id=adata.obs.index.tolist(), pred=adata.obs["pred"].tolist(),
                                  dis=adj_2d, shape="hexagon")
        adata.obs["refined_pred"] = refined_pred
        adata.obs["refined_pred"] = adata.obs["refined_pred"].astype('category')
        ARI = adjusted_rand_score(adata.obs["refined_pred"], adata.obs["original_clusters"])
        aris.append(ARI)
        print('Dataset:', dataset)
        print(ARI)

    print('Dataset:', dataset)
    print(aris)
    print(np.mean(aris))
    # Append one line per section: label followed by the space-separated ARIs.
    with open('spagcn_aris.txt', 'a+') as fp:
        fp.write('mHypothalamus' + dataset + ' ' + ' '.join(str(i) for i in aris) + '\n')

### 5. Her2Tumor dataset (8 slides)

In [None]:
"""Her2"""
# Each entry pairs the requested number of clusters with a section id.
setting_combinations = [
    [6, 'A1'], [5, 'B1'], [4, 'C1'], [4, 'D1'],
    [4, 'E1'], [4, 'F1'], [7, 'G2'], [7, 'H1'],
]
for n_clusters, dataset in setting_combinations:
    dir_ = './benchmarking_data/Her2_tumor'
    adata = load_her2_tumor(root_dir=dir_, section_id=dataset)

    aris = []  # one ARI value per training run of this section

    # cv2.imread reports a missing/unreadable file by returning None instead of
    # raising, so the result is checked explicitly; only OpenCV errors are caught.
    try:
        img = cv2.imread(os.path.join(dir_, dataset, dataset + '_full_image.tif'))
    except cv2.error:
        img = None

    s = 1   # passed as alpha to calculate_adj_matrix
    b = 49  # passed as beta to calculate_adj_matrix
    print(adata.X)
    # Cast the expression matrix to float before filtering and normalization.
    adata.X = adata.X.astype('float')
    x_array = adata.obs["x"].tolist()
    y_array = adata.obs["y"].tolist()
    x_pixel = adata.obs["pixel_x"].tolist()
    y_pixel = adata.obs["pixel_y"].tolist()
    # Identity test, not `img == None`: on a loaded image `==` compares
    # elementwise and the resulting array cannot be used as an `if` condition.
    adj = spg.calculate_adj_matrix(x=x_pixel, y=y_pixel, x_pixel=x_pixel, y_pixel=y_pixel,
                                   image=img, beta=b, alpha=s, histology=img is not None)

    spg.prefilter_genes(adata, min_cells=3)  # avoid genes that are all zeros
    spg.prefilter_specialgenes(adata)
    # Normalize and take log for UMI
    sc.pp.normalize_per_cell(adata)
    sc.pp.log1p(adata)

    p = 0.5
    # Find the l value given p
    l = spg.search_l(p, adj, start=0.01, end=1000, tol=0.01, max_run=100)
    # Set seed
    r_seed = t_seed = n_seed = 100
    # Search for a suitable resolution
    res = spg.search_res(adata, adj, l, n_clusters, start=0.7, step=0.1, tol=5e-3, lr=0.05,
                         max_epochs=20, r_seed=r_seed, t_seed=t_seed, n_seed=n_seed)

    # The coordinate-only adjacency used for refinement depends only on the
    # array coordinates, so it is computed once instead of once per run.
    adj_2d = spg.calculate_adj_matrix(x=x_array, y=y_array, histology=False)

    for _ in range(iters):
        clf = spg.SpaGCN()
        clf.set_l(l)
        # Re-seed every run
        random.seed(r_seed)
        torch.manual_seed(t_seed)
        np.random.seed(n_seed)
        # Run
        clf.train(adata, adj, init_spa=True, init="louvain", res=res, tol=5e-3, lr=0.05, max_epochs=200)
        y_pred, prob = clf.predict()
        adata.obs["pred"] = y_pred
        adata.obs["pred"] = adata.obs["pred"].astype('category')
        # Do cluster refinement (optional)
        # shape="hexagon" for Visium data, "square" for ST data.
        # NOTE(review): confirm that "hexagon" is the intended shape for this dataset.
        refined_pred = spg.refine(sample_id=adata.obs.index.tolist(), pred=adata.obs["pred"].tolist(),
                                  dis=adj_2d, shape="hexagon")
        adata.obs["refined_pred"] = refined_pred
        adata.obs["refined_pred"] = adata.obs["refined_pred"].astype('category')
        ARI = adjusted_rand_score(adata.obs["refined_pred"], adata.obs["original_clusters"])
        aris.append(ARI)
        print('Dataset:', dataset)
        print(ARI)

    print('Dataset:', dataset)
    print(aris)
    print(np.mean(aris))
    # Append one line per section: label followed by the space-separated ARIs.
    with open('spagcn_aris.txt', 'a+') as fp:
        fp.write('Her2tumor' + dataset + ' ' + ' '.join(str(i) for i in aris) + '\n')