In [2]:
import os

import pandas as pd
import scanpy as sc
import scib

In [3]:
# Directory holding the clustered supercell .h5ad inputs; results are written here too.
# The original location is machine-specific, so it can be overridden without editing the
# notebook by setting the SUPERCELLCYTO_OUTPUT_DIR environment variable. When the variable
# is unset, the original path is used unchanged. The value stays a plain string because
# later cells build file paths with `base_dir + "/..."`.
base_dir = os.environ.get(
    "SUPERCELLCYTO_OUTPUT_DIR",
    "/Users/putri.g/Documents/GitHub/SuperCellCyto-analysis/output/trussart_cytofruv/20240111",
)

In [4]:
# Load the clustered supercell AnnData for each condition (no correction, cyCombine,
# CytofRUV). The three files share one naming pattern and differ only in the method name.
uncorrected_adata, cycombine_adata, cytofruv_adata = (
    sc.read_h5ad(f"{base_dir}/supercells_{method}_clustered.h5ad")
    for method in ("uncorrected", "cycombine", "cytofruv")
)

In [5]:
# Display the uncorrected AnnData summary (dimensions and available annotation fields).
uncorrected_adata

AnnData object with n_obs × n_vars = 429488 × 31
    obs: 'sample_id', 'condition', 'patient_id', 'batch', 'FlowSOM_cluster', 'FlowSOM_metacluster'
    var: 'fcs_colname', 'antigen', 'marker_class'
    uns: 'neighbors', 'pca'
    obsm: 'X_pca'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

In [6]:
# Display the cyCombine-corrected AnnData summary.
# NOTE(review): the obs columns shown for this object include 'sample', whereas the
# uncorrected and CytofRUV objects show 'sample_id' — confirm the difference is intended.
cycombine_adata

AnnData object with n_obs × n_vars = 429488 × 31
    obs: 'sample', 'condition', 'patient_id', 'batch', 'FlowSOM_cluster', 'FlowSOM_metacluster'
    var: 'fcs_colname', 'antigen', 'marker_class'
    uns: 'neighbors', 'pca'
    obsm: 'X_pca'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

In [7]:
# Display the CytofRUV-corrected AnnData summary.
cytofruv_adata

AnnData object with n_obs × n_vars = 429488 × 31
    obs: 'sample_id', 'condition', 'patient_id', 'batch', 'FlowSOM_cluster', 'FlowSOM_metacluster'
    var: 'fcs_colname', 'antigen', 'marker_class'
    uns: 'neighbors', 'pca'
    obsm: 'X_pca'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

# Actual assessment

In [8]:
# One (initially empty) score dictionary per method; later cells fill in the metrics.
# A comprehension is used so that every method gets its own independent inner dict.
metrics_scores = {
    method: {} for method in ("uncorrected", "cycombine", "cytofruv")
}

In [9]:
# Display the score container to confirm each method starts with an empty dict.
metrics_scores

{'uncorrected': {}, 'cycombine': {}, 'cytofruv': {}}

In [10]:
# Silhouette score of the FlowSOM metaclusters, computed on the PCA embedding of each
# dataset (uncorrected and both corrected versions).
for method, adata in (
    ("uncorrected", uncorrected_adata),
    ("cytofruv", cytofruv_adata),
    ("cycombine", cycombine_adata),
):
    metrics_scores[method]["silhouette"] = scib.metrics.silhouette(
        adata,
        label_key="FlowSOM_metacluster",
        embed="X_pca",
    )
metrics_scores

{'uncorrected': {'silhouette': 0.6029170677065849},
 'cycombine': {'silhouette': 0.6218346208333969},
 'cytofruv': {'silhouette': 0.6750191301107407}}

We can also use the re-clustered (post-correction) data to calculate NMI and ARI.
If we use the cluster assignments obtained before correction as the label key, and the cluster assignments obtained by re-clustering the corrected data as the cluster key, we can compute NMI and ARI between the two.

In [11]:
# Sanity check before running the cell below, which copies the uncorrected cluster labels
# onto the corrected objects: the obs index of each corrected AnnData must match the
# uncorrected one, element for element and in the same order.
# This fails loudly on a mismatch instead of printing row numbers that are easy to miss.
for method, corrected_adata in (
    ("cytofruv", cytofruv_adata),
    ("cycombine", cycombine_adata),
):
    # Index.equals compares the elements and their order in a single vectorised call,
    # and also returns False when the two objects have a different number of rows.
    assert uncorrected_adata.obs.index.equals(corrected_adata.obs.index), (
        f"obs index of the {method} AnnData does not match the uncorrected AnnData"
    )

In [12]:
# Attach the metacluster labels obtained before correction to each corrected dataset,
# so they can serve as the reference labels for NMI and ARI.
labels_before_correction = uncorrected_adata.obs["FlowSOM_metacluster"]
for corrected_adata in (cytofruv_adata, cycombine_adata):
    corrected_adata.obs["FlowSOM_metacluster_from_uncorrected"] = labels_before_correction

In [13]:
# NMI between the pre-correction metaclusters (label key) and the metaclusters obtained
# by re-clustering the corrected data (cluster key).
for method, corrected_adata in (
    ("cytofruv", cytofruv_adata),
    ("cycombine", cycombine_adata),
):
    metrics_scores[method]["NMI"] = scib.metrics.nmi(
        corrected_adata,
        label_key="FlowSOM_metacluster_from_uncorrected",
        cluster_key="FlowSOM_metacluster",
    )
metrics_scores

{'uncorrected': {'silhouette': 0.6029170677065849},
 'cycombine': {'silhouette': 0.6218346208333969, 'NMI': 0.8265218357083224},
 'cytofruv': {'silhouette': 0.6750191301107407, 'NMI': 0.7769091032573271}}

In [14]:
# ARI between the pre-correction metaclusters (label key) and the metaclusters obtained
# by re-clustering the corrected data (cluster key), using the sklearn implementation.
for method, corrected_adata in (
    ("cytofruv", cytofruv_adata),
    ("cycombine", cycombine_adata),
):
    metrics_scores[method]["ARI"] = scib.metrics.ari(
        corrected_adata,
        label_key="FlowSOM_metacluster_from_uncorrected",
        cluster_key="FlowSOM_metacluster",
        implementation="sklearn",
    )
metrics_scores

{'uncorrected': {'silhouette': 0.6029170677065849},
 'cycombine': {'silhouette': 0.6218346208333969,
  'NMI': 0.8265218357083224,
  'ARI': 0.8989028785435254},
 'cytofruv': {'silhouette': 0.6750191301107407,
  'NMI': 0.7769091032573271,
  'ARI': 0.8721639323250747}}

In [15]:
# Tabulate the scores (one column per method, one row per metric) and save them as CSV
# alongside the input data.
metrics_table = pd.DataFrame(metrics_scores)
metrics_table.to_csv(f"{base_dir}/bio_conservation_metrics.csv")

Isolated label F1 scores take too long to compute, so they are perhaps not the best metric for cytometry data.