In [1]:
import anndata as ad
import scanpy as sc
import gc
import sys
import cellanova as cnova
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sea

In [6]:
def get_mean_std(adata, batch_key, n_top_genes=3000, raw_count_threshold=15):
    """Return the per-gene mean and standard deviation of an expression matrix.

    Steps performed on ``adata``:

    1. If the largest value in ``adata.X`` exceeds ``raw_count_threshold`` the
       matrix is filtered (cells with < 300 genes, genes in < 10 cells),
       normalized to 1e4 counts per cell and log1p-transformed **in place**.
       (Presumably such values indicate un-normalized counts -- the threshold
       is a heuristic; confirm for new datasets.)
    2. Highly variable genes are flagged with ``batch_key`` as the batch
       covariate. When the matrix has more than ``n_top_genes`` genes, only the
       flagged genes are kept (on a copy); otherwise every gene is kept.
    3. The column-wise mean and standard deviation are computed on a dense
       version of the resulting matrix.

    Parameters
    ----------
    adata
        AnnData-like object. It is modified in place by step 1 and gains the
        highly-variable-gene annotations from step 2.
    batch_key
        Column of ``adata.obs`` passed to ``sc.pp.highly_variable_genes``.
    n_top_genes
        Maximum number of genes to keep (default 3000, the previous hard-coded
        value).
    raw_count_threshold
        Values above this trigger the filtering/normalization of step 1
        (default 15, the previous hard-coded value).

    Returns
    -------
    (mean, std)
        Two arrays of shape ``(1, n_genes_kept)``.
    """
    if np.max(adata.X) > raw_count_threshold:
        sc.pp.filter_cells(adata, min_genes=300)
        sc.pp.filter_genes(adata, min_cells=10)

        # normalize_total is the documented replacement for the deprecated
        # normalize_per_cell(counts_per_cell_after=...).
        sc.pp.normalize_total(adata, target_sum=1e4)
        sc.pp.log1p(adata)

    n_genes = adata.shape[1]
    sc.pp.highly_variable_genes(
        adata, n_top_genes=min(n_top_genes, n_genes), batch_key=batch_key
    )
    if n_genes > n_top_genes:
        adata = adata[:, adata.var["highly_variable"]].copy()

    # Densify into a local instead of writing the dense matrix back to adata.X:
    # only the two reductions below need it.
    expression = adata.X if isinstance(adata.X, np.ndarray) else adata.X.toarray()

    print(expression.shape)
    mean = np.mean(expression, axis=0, keepdims=True)
    std = np.std(expression, axis=0, keepdims=True)
    return mean, std

In [7]:
# T1D dataset: per-gene mean / std of the expression matrix.
# `std` is reused further down when scoring the CellANOVA output.
adata = sc.read_h5ad("../case_control/data/t1d.h5ad")
mean, std = get_mean_std(
    adata=adata,
    batch_key="donor_id",
)
print(mean, std)

(69645, 3000)
[[0.01114433 0.00537084 0.06579028 ... 0.00249119 0.00623902 0.4020955 ]] [[0.1118257  0.07038265 0.26703846 ... 0.05428093 0.06962858 0.66590923]]


In [11]:
def _as_dense(matrix):
    """Return ``matrix`` unchanged if it is a NumPy array, else ``matrix.toarray()``."""
    if isinstance(matrix, np.ndarray):
        return matrix
    return matrix.toarray()


def calculate_rowwise_correlation_scaled(adata1, adata2, std, batch_key="batch_all_with_condition"):
    """Compare two expression matrices cell by cell after scaling genes by ``std``.

    For every value of ``adata1.obs[batch_key]`` the matching rows of both
    objects are selected (the mask is always built from ``adata1``), multiplied
    element-wise by ``std`` and compared row by row.

    Parameters
    ----------
    adata1, adata2
        AnnData-like objects with the same number of observations, in the same
        order, and the same genes in ``X``. Row ``i`` of one is compared with
        row ``i`` of the other -- this pairing is not verified here.
    std
        Per-gene scale factors, broadcastable against a ``(n_cells, n_genes)``
        matrix (e.g. shape ``(1, n_genes)``).
    batch_key
        ``obs`` column used to process the cells batch by batch.

    Returns
    -------
    pandas.DataFrame
        One row per cell with columns ``correlation`` (Pearson correlation of
        the two scaled rows), ``batch_key`` (the batch label), ``barcode``
        (the observation name from ``adata1``) and ``mse`` (mean squared
        difference of the two scaled rows).

    Raises
    ------
    AssertionError
        If ``batch_key`` is missing from either ``obs`` table.
    """
    assert batch_key in adata1.obs.columns, f"{batch_key} not found in adata1.obs"
    assert batch_key in adata2.obs.columns, f"{batch_key} not found in adata2.obs"

    scale = np.asarray(std)
    results = []

    for batch in adata1.obs[batch_key].unique():
        batch_mask = adata1.obs[batch_key] == batch
        # Slice each object once per batch and reuse the view.
        subset1 = adata1[batch_mask]
        subset2 = adata2[batch_mask]
        barcodes = subset1.obs_names.tolist()

        # Densify BEFORE scaling: for scipy sparse matrices `*` means matrix
        # multiplication, so `sparse * std` would fail (or silently compute
        # something else) instead of scaling each gene.
        data1 = _as_dense(subset1.X) * scale
        data2 = _as_dense(subset2.X) * scale

        # The squared error does not need a Python-level loop.
        mse_per_cell = np.mean(np.square(data1 - data2), axis=1)

        for i, barcode in enumerate(barcodes):
            row_corr = np.corrcoef(data1[i, :], data2[i, :])[0, 1]
            results.append(
                {
                    "correlation": row_corr,
                    batch_key: batch,
                    "barcode": barcode,
                    "mse": mse_per_cell[i],
                }
            )

    return pd.DataFrame(results)

In [12]:
def _join_obs_columns(obs, keys, sep="__"):
    """Join the string form of several ``obs`` columns row by row with ``sep``."""
    joined = obs[keys[0]].astype(str)
    for key in keys[1:]:
        joined = joined + sep + obs[key].astype(str)
    return joined.to_numpy(dtype=str)


def evaluate_cellanova_mse(adata, batch_key, condition_key, dataset_name, std, output_dir="./cellanova"):
    """Score the ``denoised`` layer of ``adata`` against ``adata.X`` and save the result.

    The function

    1. clears ``adata.raw``;
    2. adds two categorical columns to ``adata.obs``: ``batch_all`` (all batch
       columns joined with ``"__"``) and ``batch_all_with_condition`` (the
       same, followed by the condition column);
    3. wraps ``adata.layers['denoised']`` in a new AnnData sharing ``obs`` and
       ``var_names`` with ``adata``;
    4. calls ``calculate_rowwise_correlation_scaled`` on the pair, batch by
       batch over ``batch_all_with_condition``;
    5. writes the per-cell table to
       ``<output_dir>/<dataset_name>_global_correlation_scaled.csv``.

    Parameters
    ----------
    adata
        AnnData object with a ``denoised`` layer. Modified in place (steps 1-2).
    batch_key
        One ``obs`` column name or a list of them. The list is not modified.
    condition_key
        ``obs`` column holding the condition label.
    dataset_name
        Prefix of the output file name.
    std
        Per-gene scale factors forwarded to
        ``calculate_rowwise_correlation_scaled``; assumed to be aligned with
        ``adata.var_names`` -- TODO confirm against the caller.
    output_dir
        Directory for the CSV; created if missing. Defaults to the previously
        hard-coded ``./cellanova``.

    Returns
    -------
    None
    """
    import os
    import warnings

    adata.raw = None
    print("adata preprocessing...")

    # Work on a private list so the caller's batch_key is never mutated.
    batch_keys = [batch_key] if isinstance(batch_key, str) else list(batch_key)

    # Keep the warning suppression local to this call instead of switching
    # warnings off for the rest of the kernel session.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        # Vectorized label construction; avoids per-cell positional lookups
        # (`obs[col][i]`) on a label-indexed Series.
        batch_all = _join_obs_columns(adata.obs, batch_keys)
        adata.obs["batch_all"] = batch_all
        adata.obs["batch_all"] = adata.obs["batch_all"].astype("category")
        print("batch_all", np.unique(batch_all))

        batch_all_with_condition = _join_obs_columns(adata.obs, batch_keys + [condition_key])
        adata.obs["batch_all_with_condition"] = batch_all_with_condition
        adata.obs["batch_all_with_condition"] = adata.obs["batch_all_with_condition"].astype("category")
        print("batch_all_with_condition", np.unique(batch_all_with_condition))
        print("Finish preprocessing")

        # Cast before construction rather than through the deprecated
        # `dtype=` argument of AnnData.
        denoised = adata.layers['denoised'].astype(np.float32, copy=False)
        integrated = ad.AnnData(denoised)
        integrated.obs = adata.obs.copy()
        integrated.var_names = adata.var_names

        print("Calculating global distortion...")
        df_global_correlation = calculate_rowwise_correlation_scaled(
            adata, integrated, std, batch_key="batch_all_with_condition"
        )
        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, dataset_name + "_global_correlation_scaled.csv")
        df_global_correlation.to_csv(output_path)
        print("Finish")

In [13]:
# T1D: load the saved CellANOVA result and write the per-cell
# correlation / MSE table, scaled by the `std` computed above.
adata = sc.read_h5ad("./cellanova/t1d_results.h5ad")
evaluate_cellanova_mse(
    adata=adata,
    batch_key="donor_id",
    condition_key="disease_state",
    dataset_name="t1d",
    std=std,
)

adata preprocessing...
batch_all ['HPAP019' 'HPAP020' 'HPAP021' 'HPAP022' 'HPAP023' 'HPAP024' 'HPAP026'
 'HPAP028' 'HPAP029' 'HPAP032' 'HPAP034' 'HPAP035' 'HPAP036' 'HPAP037'
 'HPAP038' 'HPAP039' 'HPAP040' 'HPAP042' 'HPAP043' 'HPAP044' 'HPAP045'
 'HPAP047' 'HPAP049' 'HPAP050']
batch_all_with_condition ['HPAP019__AAB' 'HPAP020__T1D' 'HPAP021__T1D' 'HPAP022__Control'
 'HPAP023__T1D' 'HPAP024__AAB' 'HPAP026__Control' 'HPAP028__T1D'
 'HPAP029__AAB' 'HPAP032__T1D' 'HPAP034__Control' 'HPAP035__Control'
 'HPAP036__Control' 'HPAP037__Control' 'HPAP038__AAB' 'HPAP039__Control'
 'HPAP040__Control' 'HPAP042__Control' 'HPAP043__AAB' 'HPAP044__Control'
 'HPAP045__AAB' 'HPAP047__Control' 'HPAP049__AAB' 'HPAP050__AAB']
Finish preprocessing
Calculating global distortion...
Finish


# Kidney dataset

In [14]:
# Kidney: keep only the RNA modalities, then compute per-gene mean / std.
adata = sc.read_h5ad("../case_control/data/GSE211785_Susztak_SC_SN_ATAC_merged_PreSCVI_final.h5ad")
is_rna = adata.obs["tech"].isin(['SC_RNA', 'SN_RNA'])
adata = adata[is_rna].copy()
mean, std = get_mean_std(adata=adata, batch_key="orig_ident")
print(mean, std)

# Score the saved CellANOVA result with the statistics computed above.
adata = sc.read_h5ad("./cellanova/kidney_results.h5ad")
evaluate_cellanova_mse(
    adata=adata,
    batch_key="orig_ident",
    condition_key="Status",
    dataset_name="kidney",
    std=std,
)

(282610, 3000)
[[0.08848958 0.18313816 0.0182847  ... 0.22860017 0.03765563 0.01596534]] [[0.38014755 0.64402455 0.19035025 ... 0.68938184 0.2806099  0.17945607]]
adata preprocessing...
batch_all ['HK1584.SC' 'HK1585.SC' 'HK1588.SC' 'HK1593.SC' 'HK1594.SC' 'HK1597.SC'
 'HK1632.SC' 'HK1634.SC' 'HK1635_G.SC' 'HK1635_T.SC' 'HK1650_G.SC'
 'HK1650_T.SC' 'HK1664_G.SC' 'HK1664_T.SC' 'HK1770.SC' 'HK1997.SC'
 'HK2558.SN' 'HK2596.SN' 'HK2662.SC' 'HK2663_SN' 'HK2711.SN' 'HK2713.SC'
 'HK2739.SN' 'HK2770.SC' 'HK2770_SN' 'HK2774.SC' 'HK2774.SN' 'HK2833.SC'
 'HK2833.SN' 'HK2844.SN' 'HK2862.SN' 'HK2867.SC' 'HK2867.SN' 'HK2868.SN'
 'HK2891.SC' 'HK2891.SN' 'HK2893.SC' 'HK2893.SN' 'HK2895.SC' 'HK2895.SN'
 'HK2896.SC' 'HK2898.SN' 'HK2899.SC' 'HK2899.SN' 'HK2923.SC' 'HK2924.SC'
 'HK2976.SN']
batch_all_with_condition ['HK1584.SC__Disease' 'HK1585.SC__Control' 'HK1588.SC__Control'
 'HK1593.SC__Disease' 'HK1594.SC__Disease' 'HK1597.SC__Disease'
 'HK1632.SC__Disease' 'HK1634.SC__Disease' 'HK1635_G.SC__Disease'

# Mouse radiation experiment dataset

https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE280883

In [None]:
# Mouse: per-gene mean / std of the expression matrix.
adata = sc.read_h5ad("../case_control/data/mouse.h5ad")
mean, std = get_mean_std(adata=adata, batch_key='real_batch')
print(mean, std)

# This cell previously loaded "./cellanova/kidney_results.h5ad" and wrote with
# dataset_name="kidney", i.e. it scored the kidney output with mouse statistics
# and overwrote the kidney CSV. Point it at the mouse result instead.
# TODO(review): confirm the result file name (inferred from the
# "<dataset>_results.h5ad" pattern used by the other datasets) and that
# 'sample', 'tech' and "Status" are the right obs columns for the mouse data.
adata = sc.read_h5ad("./cellanova/mouse_results.h5ad")
evaluate_cellanova_mse(
    adata=adata,
    batch_key=['sample', 'tech'],
    condition_key="Status",
    dataset_name="mouse",
    std=std,
)

# SEA-AD dataset

In [None]:
# SEA-AD: per-gene mean / std of the expression matrix.
adata = sc.read_h5ad("../case_control/data/AD.h5ad")
mean, std = get_mean_std(adata=adata, batch_key='donor_id')
print(mean, std)

# Load the saved CellANOVA result, drop its `.raw`, and score it.
adata = sc.read_h5ad("./cellanova/AD_results.h5ad")
adata.raw = None

evaluate_cellanova_mse(
    adata=adata,
    batch_key=['donor_id'],
    condition_key="disease",
    dataset_name="AD",
    std=std,
)