On max170.

In [1]:
from pathlib import Path

import scanpy as sc
import anndata as ad
import scib
import numpy as np
import pandas as pd

In [2]:
# Run the companion notebook so that its definitions are available in this session.
# The scoring loop below calls `silhouette_batch_custom`, which is neither defined nor
# imported anywhere else in this notebook -- presumably it is defined by this %run
# (TODO confirm in custom_silhouette_functions.ipynb).
%run ./custom_silhouette_functions.ipynb

Signature:
silhouette_samples_custom(
    X,
    labels,
    metric='euclidean',
    between_cluster_distances='nearest',
)
Docstring:
Compute the average silhouette score for the dataset X with the given labels.

Parameters:
X : array-like, shape (n_samples, n_features)
    Feature array.
labels : array-like, shape (n_samples,)
    Labels of each point.
    
metric : metric for distance calculation, default:"euclidean", alternatives, e.g., "cosine"

between_cluster_distances: one out of "mean_other", "furthest", "nearest"


Returns:
score : float
    The average silhouette score.
File:      /tmp/7409106.1.all.q/ipykernel_2374851/4094074416.py
Type:      function

In [3]:
import warnings

# Ignore these two warning categories from here on: Python's built-in FutureWarning
# and anndata's OldFormatWarning (presumably emitted while reading the stored .h5ad
# embeddings -- TODO confirm).
for silenced_category in (FutureWarning, ad.OldFormatWarning):
    warnings.simplefilter('ignore', category=silenced_category)

In [None]:
# Integration scenarios to score. Each name is substituted into
# 'embeddings/{}.embedding.h5ad' by the scoring loop to locate its embedding file.
scenarios = [
    'Liam_x1',
    'Liam_x5',
    'naiveIntegration',
    'noIntegration',
]

In [None]:
np.random.seed(61)

# Every silhouette-based score is computed twice: once without a `metric` argument
# (the callee's default distance, stored under the bare score name) and once with
# metric='cosine' (stored under the score name + '_cosine').
METRIC_VARIANTS = (('', {}), ('_cosine', {'metric': 'cosine'}))

# Collect computed scores as {scenario: {score_name: value}}; the nested dict is
# simple to convert to a pd.DataFrame.
score_dict = {}
for scenario in scenarios:
    # Register the inner dict up front so that scores computed before a failure
    # remain available in score_dict.
    scenario_scores = score_dict[scenario] = {}

    adata = ad.read_h5ad(f'embeddings/{scenario}.embedding.h5ad')
    # The stored .X is treated as the embedding; copy the reference into .obsm so it
    # can be addressed as embed='embedding' / use_rep='embedding' below.
    adata.obsm['embedding'] = adata.X

    sc.pp.neighbors(adata, use_rep='embedding')

    # Compute scores
    ## Level of evaluation: batch/sample
    ### asw_batch (scib implementation)
    for suffix, metric_kwargs in METRIC_VARIANTS:
        scenario_scores['asw_batch' + suffix] = scib.me.silhouette_batch(
            adata,
            batch_key='batch',
            group_key='cell_type',
            embed='embedding',
            verbose=False,
            **metric_kwargs,
        )

    ### asw_batch_mean_other and asw_batch_furthest (custom implementation)
    for between_cluster_distances in ('mean_other', 'furthest'):
        for suffix, metric_kwargs in METRIC_VARIANTS:
            score_name = f'asw_batch_{between_cluster_distances}{suffix}'
            scenario_scores[score_name] = silhouette_batch_custom(
                adata,
                batch_key='batch',
                group_key='cell_type',
                embed='embedding',
                between_cluster_distances=between_cluster_distances,
                verbose=False,
                **metric_kwargs,
            )

    ### graph iLISI and cLISI on variable batch
    scenario_scores['iLISI_batch'], scenario_scores['cLISI_full'] = scib.me.lisi.lisi_graph(
        adata,
        batch_key='batch',
        label_key='cell_type',
        type_='knn',
    )

    ### CiLISI: per-cell-type graph iLISI, averaged with cell counts as weights
    weighted_ilisi = []  # one entry per cell type: iLISI * number of cells
    n_cells_total = 0
    for cell_type in adata.obs['cell_type'].unique():
        adata_cell_type = adata[adata.obs['cell_type'] == cell_type]
        cell_type_iLISI = scib.metrics.ilisi_graph(adata_cell_type, batch_key='batch', type_='knn')
        n_cells = adata_cell_type.shape[0]
        weighted_ilisi.append(cell_type_iLISI * n_cells)
        n_cells_total += n_cells
        print(cell_type, cell_type_iLISI)
    print(weighted_ilisi)
    # NOTE(review): np.nansum drops NaN entries from the numerator only; the cells of
    # a cell type whose iLISI is NaN are still counted in n_cells_total, so such a
    # cell type effectively contributes a score of 0. Confirm this is intended.
    cilisi = np.nansum(weighted_ilisi) / n_cells_total
    print(cilisi)
    scenario_scores['CiLISI_batch'] = cilisi

    ## Level of evaluation: cell type
    ### asw_label
    for suffix, metric_kwargs in METRIC_VARIANTS:
        scenario_scores['asw_label' + suffix] = scib.me.silhouette(
            adata,
            group_key='cell_type',
            embed='embedding',
            **metric_kwargs,
        )

    ### nmi and ari
    # For each metric, cluster_optimal_resolution is run with that metric as the
    # optimisation target and cluster_key='cluster'; the metric is then evaluated on
    # the 'cluster' column against 'cell_type' (presumably the optimised clustering
    # is stored under adata.obs['cluster'] -- confirm against the scib docs).
    # The column names are passed positionally as (adata, clusters, labels) because
    # the keyword names of these two arguments are not the same across scib releases
    # (group1/group2 vs. cluster_key/label_key), while their position is.
    for score_name, metric_function in (('nmi', scib.me.nmi), ('ari', scib.me.ari)):
        scib.metrics.cluster_optimal_resolution(
            adata,
            label_key='cell_type',
            cluster_key='cluster',
            metric=metric_function,
        )
        scenario_scores[score_name] = metric_function(adata, 'cluster', 'cell_type')

In [None]:
# Tabulate the nested {scenario: {score_name: value}} dict: the outer keys
# (scenarios) become the columns and the inner keys (score names) the index.
scores = pd.DataFrame.from_dict(score_dict, orient='columns')

In [None]:
# Display the score table built from score_dict (one column per scenario).
scores

In [None]:
# Write the score table (scenarios as columns, score names as the index) to CSV.
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output directory exists first; otherwise the export would fail only after the
# expensive scoring loop has already finished.
output_path = Path("evaluation/batch_removal_scores_real_data.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
pd.DataFrame(score_dict).to_csv(output_path, index=True)