# Download CellphoneDB

In [2]:
from IPython.display import HTML, display
from cellphonedb.utils import db_releases_utils

# Fetch the release listing once, then render its 'db_releases_html_table' entry as HTML.
releases = db_releases_utils.get_remote_database_versions_html()
display(HTML(releases['db_releases_html_table']))

Version,Release date
v4.1.0,2023-03-09
,
,
,


In [4]:
import os

# -- CellphoneDB database version to use
cpdb_version = 'v5.0.0'

# -- Version-specific directory holding the database files:
#    <database root>/<cpdb_version>
cpdb_database_root = '/gpfs/gibbs/project/wang_zuoheng/xx244/GITIII_benchmark/CellphoneDB/database/'
cpdb_target_dir = os.path.join(cpdb_database_root, cpdb_version)

In [5]:
from cellphonedb.utils import db_utils

# Download the CellphoneDB files for release `cpdb_version` into `cpdb_target_dir`.
db_utils.download_database(cpdb_target_dir, cpdb_version)

Downloaded cellphonedb.zip into /gpfs/gibbs/project/wang_zuoheng/xx244/GITIII_benchmark/CellphoneDB/database/v5.0.0
Downloaded complex_input.csv into /gpfs/gibbs/project/wang_zuoheng/xx244/GITIII_benchmark/CellphoneDB/database/v5.0.0
Downloaded gene_input.csv into /gpfs/gibbs/project/wang_zuoheng/xx244/GITIII_benchmark/CellphoneDB/database/v5.0.0
Downloaded interaction_input.csv into /gpfs/gibbs/project/wang_zuoheng/xx244/GITIII_benchmark/CellphoneDB/database/v5.0.0
Downloaded protein_input.csv into /gpfs/gibbs/project/wang_zuoheng/xx244/GITIII_benchmark/CellphoneDB/database/v5.0.0
Downloaded uniprot_synonyms.tsv into /gpfs/gibbs/project/wang_zuoheng/xx244/GITIII_benchmark/CellphoneDB/database/v5.0.0/sources
Downloaded transcription_factor_input.csv into /gpfs/gibbs/project/wang_zuoheng/xx244/GITIII_benchmark/CellphoneDB/database/v5.0.0/sources


In [6]:
# Path to the CellphoneDB archive passed to every analysis call below.
# Derived from cpdb_target_dir (instead of a second hard-coded copy of the
# path) so it always follows cpdb_version.
cpdb_file_path = os.path.join(cpdb_target_dir, 'cellphonedb.zip')

# Mouse brain dataset

CellphoneDB is designed for human data, not mouse, so no CellphoneDB analysis is run on the mouse brain dataset.

https://github.com/ventolab/CellphoneDB

CellphoneDB is a publicly available repository of HUMAN curated receptors, ligands and their interactions paired with a tool to interrogate your own single-cell transcriptomics data.

# AD dataset

In [16]:
import pandas as pd
import torch
import scanpy as sc
import anndata as ad
import numpy as np

In [17]:
# Read the AD table and keep only the cells of a single tissue section.
df = (
    pd.read_csv("./data/AD/AD.csv")
    .loc[lambda table: table["section"] == "H20.33.001.CX28.MTG.02.007.1.02.03"]
    .copy()
)

# Gene names used both to select the expression columns and to label the variables.
genes = torch.load("./data/AD/genes.pth")

# Cell centre coordinates, reused for the obs columns and the spatial embedding.
center_x = df["centerx"].to_numpy()
center_y = df["centery"].to_numpy()

adata = ad.AnnData(X=df[genes].values)
adata.var_names = genes
adata.obs["centerx"] = center_x
adata.obs["centery"] = center_y
adata.obsm["spatial"] = np.column_stack((center_x, center_y))
print(adata)

AnnData object with n_obs × n_vars = 15225 × 140
    obs: 'centerx', 'centery'
    obsm: 'spatial'


  genes=torch.load("./data/AD/genes.pth")


In [18]:
import os

# Make sure the scratch directory exists so the write below does not fail on a fresh checkout.
os.makedirs("./tmp", exist_ok=True)

# Normalise total counts per cell, then log1p-transform; both modify `adata` in place.
# NOTE: this cell is not idempotent - re-running it without rebuilding `adata`
# applies the transformations a second time.
sc.pp.normalize_total(adata, inplace=True)
sc.pp.log1p(adata)

# Counts file consumed by the CellphoneDB call.
adata.write_h5ad("./tmp/AD.h5ad")

In [19]:
# Cell identifiers as stored in `adata` and the matching cell-type label of each row of `df`.
barcodes = adata.obs_names
cell_types = df["subclass"].values

# Build the barcode -> cell type table under its own name. Rebinding `df` here
# would discard the expression table and make this cell fail with a KeyError
# on "subclass" when it is run a second time.
meta_df = pd.DataFrame({"barcode_sample": barcodes, "cell_type": cell_types})

# Write the metadata as a tab-separated file without the row index.
meta_df.to_csv("./tmp/AD_meta.tsv", sep="\t", index=False)

In [20]:
from cellphonedb.src.core.methods import cpdb_statistical_analysis_method

# Inputs and output location for the AD run.
counts_file_path = "./tmp/AD.h5ad"
meta_file_path = "./tmp/AD_meta.tsv"
out_path = '/gpfs/gibbs/project/wang_zuoheng/xx244/GITIII_benchmark/CellphoneDB/AD/'

# Statistical analysis; keyword arguments are grouped by purpose.
cpdb_results = cpdb_statistical_analysis_method.call(
    # --- inputs ---
    cpdb_file_path = cpdb_file_path,       # mandatory: CellphoneDB database zip file.
    meta_file_path = meta_file_path,       # mandatory: tsv file mapping barcodes to cell labels.
    counts_file_path = counts_file_path,   # mandatory: normalized count matrix - a path to the counts file, or an in-memory AnnData object.
    counts_data = 'hgnc_symbol',           # gene annotation used in the counts matrix.
    # --- statistics ---
    iterations = 1000,                     # number of shufflings performed in the analysis.
    threshold = 0.1,                       # min % of cells expressing a gene for it to be used in the analysis.
    pvalue = 0.05,                         # p-value threshold for significance.
    result_precision = 3,                  # rounding of the mean values in significant_means.
    score_interactions = True,             # optional: whether to score interactions or not.
    # --- execution ---
    threads = 5,                           # number of threads to use in the analysis.
    debug_seed = 42,                       # debug random seed. NOTE(review): the original note read "To disable >=0"; presumably a negative value disables seeding - confirm in the CellphoneDB docs.
    debug = False,                         # saves all intermediate tables used during the analysis in pkl format.
    # --- subsampling (geometric sketching) ---
    subsampling = False,                   # enable subsampling of the data.
    subsampling_log = False,               # (mandatory) enable log1p for non log-transformed inputs when subsampling.
    subsampling_num_pc = 100,              # number of components to subsample via geometric sketching (default: 100).
    subsampling_num_cells = 1000,          # number of cells to subsample (integer) (default: 1/3 of the dataset).
    # --- output ---
    separator = '|',                       # string separating cell names in the results dataframes, e.g. "cellA|CellB".
    output_path = out_path,                # path to save results.
    output_suffix = None,                  # replaces the timestamp in the output file names with a user-defined string (default: None).
)

Reading user files...
The following user files were loaded successfully:
./tmp/AD.h5ad
./tmp/AD_meta.tsv
[ ][CORE][21/02/25-17:07:54][INFO] [Cluster Statistical Analysis] Threshold:0.1 Iterations:1000 Debug-seed:42 Threads:5 Precision:3
[ ][CORE][21/02/25-17:07:56][INFO] Running Real Analysis
[ ][CORE][21/02/25-17:07:56][INFO] Running Statistical Analysis


100%|██████████| 1000/1000 [00:36<00:00, 27.72it/s]

[ ][CORE][21/02/25-17:08:32][INFO] Building Pvalues result
[ ][CORE][21/02/25-17:08:32][INFO] Building results
[ ][CORE][21/02/25-17:08:32][INFO] Scoring interactions: Filtering genes per cell type..



100%|██████████| 24/24 [00:00<00:00, 204.02it/s]

[ ][CORE][21/02/25-17:08:32][INFO] Scoring interactions: Calculating mean expression of each gene per group/cell type..




00%|██████████| 24/24 [00:00<00:00, 551.80it/s]

[ ][CORE][21/02/25-17:08:32][INFO] Scoring interactions: Calculating scores for all interactions and cell types..



00%|██████████| 576/576 [00:00<00:00, 711.99it/s]

Saved deconvoluted to /gpfs/gibbs/project/wang_zuoheng/xx244/GITIII_benchmark/CellphoneDB/AD/statistical_analysis_deconvoluted_02_21_2025_170834.txt
Saved deconvoluted_percents to /gpfs/gibbs/project/wang_zuoheng/xx244/GITIII_benchmark/CellphoneDB/AD/statistical_analysis_deconvoluted_percents_02_21_2025_170834.txt
Saved means to /gpfs/gibbs/project/wang_zuoheng/xx244/GITIII_benchmark/CellphoneDB/AD/statistical_analysis_means_02_21_2025_170834.txt
Saved pvalues to /gpfs/gibbs/project/wang_zuoheng/xx244/GITIII_benchmark/CellphoneDB/AD/statistical_analysis_pvalues_02_21_2025_170834.txt
Saved significant_means to /gpfs/gibbs/project/wang_zuoheng/xx244/GITIII_benchmark/CellphoneDB/AD/statistical_analysis_significant_means_02_21_2025_170834.txt
Saved interaction_scores to /gpfs/gibbs/project/wang_zuoheng/xx244/GITIII_benchmark/CellphoneDB/AD/statistical_analysis_interaction_scores_02_21_2025_170834.txt


In [23]:
# Persist the complete result object first.
# NOTE(review): torch.save pickles the object; reading it back on recent PyTorch
# presumably needs torch.load(..., weights_only=False) - confirm.
torch.save(cpdb_results, "./CellphoneDB/AD.pth")

# Keep the preview as the LAST expression of the cell: Jupyter only renders the
# final expression, so a head() call placed before torch.save is silently discarded.
cpdb_results['pvalues'].head(5)

# NSCLC

In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv("./data/NSCLC/NSCLC.csv")

# Collapse every cell-type label containing the substring "tumor" into the single
# class "tumor". The match is a plain (non-regex) substring test; missing labels
# are treated as non-matching and left unchanged.
is_tumor = df["CellType"].str.contains("tumor", regex=False, na=False)
df.loc[is_tumor, "CellType"] = "tumor"

# NOTE: this overwrites the input file in place.
# index=False keeps the row index out of the file; without it every run of this
# cell adds an extra "Unnamed: 0" column that shows up on the next read_csv.
df.to_csv("./data/NSCLC/NSCLC.csv", index=False)

In [24]:
# Read the NSCLC table and keep only the cells of section "Lung6".
df = (
    pd.read_csv("./data/NSCLC/NSCLC.csv")
    .loc[lambda table: table["section"] == "Lung6"]
    .copy()
)
print(df.columns)

# Gene names used both to select the expression columns and to label the variables.
genes = torch.load("./data/NSCLC/genes.pth")

# Global pixel coordinates of each cell centre, reused for obs and the spatial embedding.
center_x = df['CenterX_global_px'].to_numpy()
center_y = df['CenterY_global_px'].to_numpy()

adata = ad.AnnData(X=df[genes].values)
adata.var_names = genes
adata.obs["centerx"] = center_x
adata.obs["centery"] = center_y
adata.obsm["spatial"] = np.column_stack((center_x, center_y))
print(adata)

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.
Index(['fov', 'cell_ID', 'AATK', 'ABL1', 'ABL2', 'ACE', 'ACE2', 'ACKR1',
       'ACKR3', 'ACKR4',
       ...
       'SampleID', 'Area', 'AspectRatio', 'CenterX_local_px',
       'CenterY_local_px', 'CenterX_global_px', 'CenterY_global_px', 'Width',
       'Height', 'section'],
      dtype='object', length=974)
AnnData object with n_obs × n_vars = 89948 × 960
    obs: 'centerx', 'centery'
    obsm: 'spatial'


In [None]:
import os

from cellphonedb.src.core.methods import cpdb_statistical_analysis_method

# Make sure the scratch directory exists so the writes below do not fail on a fresh checkout.
os.makedirs("./tmp", exist_ok=True)

# Normalise total counts per cell, then log1p-transform; both modify `adata` in place.
# NOTE: not idempotent - re-running this cell without rebuilding `adata` transforms the data twice.
sc.pp.normalize_total(adata, inplace=True)
sc.pp.log1p(adata)
adata.write_h5ad("./tmp/NSCLC.h5ad")

barcodes = adata.obs_names
cell_types = df["CellType"].values  # the label column is dataset-specific: "CellType" here

# Build the barcode -> cell type table under its own name. Rebinding `df` here
# would discard the expression table and break a second run of this cell.
meta_df = pd.DataFrame({"barcode_sample": barcodes, "cell_type": cell_types})
# Write to a .tsv file
meta_df.to_csv("./tmp/NSCLC_meta.tsv", sep="\t", index=False)

meta_file_path = "./tmp/NSCLC_meta.tsv"
counts_file_path = "./tmp/NSCLC.h5ad"
out_path = '/gpfs/gibbs/project/wang_zuoheng/xx244/GITIII_benchmark/CellphoneDB/NSCLC/'

cpdb_results = cpdb_statistical_analysis_method.call(
    cpdb_file_path = cpdb_file_path,                 # mandatory: CellphoneDB database zip file.
    meta_file_path = meta_file_path,                 # mandatory: tsv file mapping barcodes to cell labels.
    counts_file_path = counts_file_path,             # mandatory: normalized count matrix - a path to the counts file, or an in-memory AnnData object.
    counts_data = 'hgnc_symbol',                     # gene annotation used in the counts matrix.
    score_interactions = True,                       # optional: whether to score interactions or not.
    iterations = 1000,                               # number of shufflings performed in the analysis.
    threshold = 0.1,                                 # min % of cells expressing a gene for it to be used in the analysis.
    threads = 5,                                     # number of threads to use in the analysis.
    debug_seed = 42,                                 # debug random seed. NOTE(review): the original note read "To disable >=0"; presumably a negative value disables seeding - confirm in the CellphoneDB docs.
    result_precision = 3,                            # rounding of the mean values in significant_means.
    pvalue = 0.05,                                   # p-value threshold for significance.
    subsampling = False,                             # enable subsampling of the data (geometric sketching).
    subsampling_log = False,                         # (mandatory) enable log1p for non log-transformed inputs when subsampling.
    subsampling_num_pc = 100,                        # number of components to subsample via geometric sketching (default: 100).
    subsampling_num_cells = 1000,                    # number of cells to subsample (integer) (default: 1/3 of the dataset).
    separator = '|',                                 # string separating cell names in the results dataframes, e.g. "cellA|CellB".
    debug = False,                                   # saves all intermediate tables used during the analysis in pkl format.
    output_path = out_path,                          # path to save results.
    output_suffix = None                             # replaces the timestamp in the output file names with a user-defined string (default: None).
    )

# Persist the complete result object first.
torch.save(cpdb_results, "./CellphoneDB/NSCLC.pth")

# Keep the preview as the LAST expression of the cell: Jupyter only renders the
# final expression, so a head() call placed before torch.save is silently discarded.
cpdb_results['pvalues'].head(5)

# BC

In [32]:
# Read the BC table and keep only the cells of section "sample1_rep1".
df = (
    pd.read_csv("./data/BC/BC.csv")
    .loc[lambda table: table["section"] == "sample1_rep1"]
    .copy()
)
print(df.columns)

# Gene names used both to select the expression columns and to label the variables.
genes = torch.load("./data/BC/genes.pth")
print(genes)

# Cell centre coordinates, reused for the obs columns and the spatial embedding.
center_x = df["centerx"].to_numpy()
center_y = df["centery"].to_numpy()

adata = ad.AnnData(X=df[genes].values)
adata.var_names = genes
adata.obs["centerx"] = center_x
adata.obs["centery"] = center_y
adata.obsm["spatial"] = np.column_stack((center_x, center_y))
print(adata)

Index(['Unnamed: 0', 'ABCC11', 'ACTA2', 'ACTG2', 'ADAM9', 'ADGRE5', 'ADH1B',
       'ADIPOQ', 'AGR3', 'AHSP',
       ...
       'antisense_TRMU', 'antisense_MYLIP', 'antisense_LGI3',
       'antisense_BCL2L15', 'antisense_ADCY4', 'centerx', 'centery',
       'subclass', 'index', 'section'],
      dtype='object', length=327)
['ABCC11', 'ACTA2', 'ACTG2', 'ADAM9', 'ADGRE5', 'ADH1B', 'ADIPOQ', 'AGR3', 'AHSP', 'AIF1', 'AKR1C1', 'AKR1C3', 'ALDH1A3', 'ANGPT2', 'ANKRD28', 'ANKRD29', 'ANKRD30A', 'APOBEC3A', 'APOBEC3B', 'APOC1', 'AQP1', 'AQP3', 'AR', 'AVPR1A', 'BACE2', 'BANK1', 'BASP1', 'BTNL9', 'C15orf48', 'C1QA', 'C1QC', 'C2orf42', 'C5orf46', 'C6orf132', 'CAV1', 'CAVIN2', 'CCDC6', 'CCDC80', 'CCL20', 'CCL5', 'CCL8', 'CCND1', 'CCPG1', 'CCR7', 'CD14', 'CD163', 'CD19', 'CD1C', 'CD247', 'CD27', 'CD274', 'CD3D', 'CD3E', 'CD3G', 'CD4', 'CD68', 'CD69', 'CD79A', 'CD79B', 'CD80', 'CD83', 'CD86', 'CD8A', 'CD8B', 'CD9', 'CD93', 'CDC42EP1', 'CDH1', 'CEACAM6', 'CEACAM8', 'CENPF', 'CLCA2', 'CLDN4', 'CLDN5', 

In [None]:
import os

from cellphonedb.src.core.methods import cpdb_statistical_analysis_method

# Make sure the scratch directory exists so the writes below do not fail on a fresh checkout.
os.makedirs("./tmp", exist_ok=True)

# Normalise total counts per cell, then log1p-transform; both modify `adata` in place.
# NOTE: not idempotent - re-running this cell without rebuilding `adata` transforms the data twice.
sc.pp.normalize_total(adata, inplace=True)
sc.pp.log1p(adata)
adata.write_h5ad("./tmp/BC.h5ad")

barcodes = adata.obs_names
cell_types = df["subclass"].values  # the label column is dataset-specific: "subclass" here

# Build the barcode -> cell type table under its own name. Rebinding `df` here
# would discard the expression table and break a second run of this cell.
meta_df = pd.DataFrame({"barcode_sample": barcodes, "cell_type": cell_types})
# Write to a .tsv file
meta_df.to_csv("./tmp/BC_meta.tsv", sep="\t", index=False)

meta_file_path = "./tmp/BC_meta.tsv"
counts_file_path = "./tmp/BC.h5ad"
out_path = '/gpfs/gibbs/project/wang_zuoheng/xx244/GITIII_benchmark/CellphoneDB/BC/'

cpdb_results = cpdb_statistical_analysis_method.call(
    cpdb_file_path = cpdb_file_path,                 # mandatory: CellphoneDB database zip file.
    meta_file_path = meta_file_path,                 # mandatory: tsv file mapping barcodes to cell labels.
    counts_file_path = counts_file_path,             # mandatory: normalized count matrix - a path to the counts file, or an in-memory AnnData object.
    counts_data = 'hgnc_symbol',                     # gene annotation used in the counts matrix.
    score_interactions = True,                       # optional: whether to score interactions or not.
    iterations = 1000,                               # number of shufflings performed in the analysis.
    threshold = 0.1,                                 # min % of cells expressing a gene for it to be used in the analysis.
    threads = 5,                                     # number of threads to use in the analysis.
    debug_seed = 42,                                 # debug random seed. NOTE(review): the original note read "To disable >=0"; presumably a negative value disables seeding - confirm in the CellphoneDB docs.
    result_precision = 3,                            # rounding of the mean values in significant_means.
    pvalue = 0.05,                                   # p-value threshold for significance.
    subsampling = False,                             # enable subsampling of the data (geometric sketching).
    subsampling_log = False,                         # (mandatory) enable log1p for non log-transformed inputs when subsampling.
    subsampling_num_pc = 100,                        # number of components to subsample via geometric sketching (default: 100).
    subsampling_num_cells = 1000,                    # number of cells to subsample (integer) (default: 1/3 of the dataset).
    separator = '|',                                 # string separating cell names in the results dataframes, e.g. "cellA|CellB".
    debug = False,                                   # saves all intermediate tables used during the analysis in pkl format.
    output_path = out_path,                          # path to save results.
    output_suffix = None                             # replaces the timestamp in the output file names with a user-defined string (default: None).
    )

# Persist the complete result object first.
torch.save(cpdb_results, "./CellphoneDB/BC.pth")

# Keep the preview as the LAST expression of the cell: Jupyter only renders the
# final expression, so a head() call placed before torch.save is silently discarded.
cpdb_results['pvalues'].head(5)