In [2]:
import numpy as np
import scanpy as sc

## Bhat-Nakshatri dataset
snRNA-seq analyses of breast tissues of healthy women of diverse genetic ancestry  
Link: https://datasets.cellxgene.cziscience.com/bc4614b7-5ab2-4f30-8e8b-eaeeb840c154.h5ad

In [2]:
# Load the .h5ad file
adata_breast= sc.read_h5ad("datasets/bc4614b7-5ab2-4f30-8e8b-eaeeb840c154.h5ad")

In [39]:
adata_breast

AnnData object with n_obs × n_vars = 51367 × 35477
    obs: 'mapped_reference_annotation', 'donor_id', 'donor_age', 'self_reported_ethnicity_ontology_term_id', 'donor_living_at_sample_collection', 'donor_menopausal_status', 'donor_times_pregnant', 'sample_uuid', 'sample_preservation_method', 'tissue_ontology_term_id', 'development_stage_ontology_term_id', 'sample_derivation_process', 'menstrual_phase_at_collection', 'sample_source', 'donor_BMI_at_collection', 'tissue_type', 'suspension_derivation_process', 'suspension_uuid', 'suspension_type', 'tissue_handling_interval', 'library_uuid', 'assay_ontology_term_id', 'library_starting_quantity', 'sequencing_platform', 'is_primary_data', 'cell_type_ontology_term_id', 'author_cell_type', 'disease_ontology_term_id', 'reported_diseases', 'sex_ontology_term_id', 'celltype', 'ML_sub', 'Basal_sub', 'epi_sub', 'LP_sub', 'LHS_sub', 'LASP_sub', 'Group', 'Pool', 'nFeature_ATAC', 'nCount_RNA', 'nCount_SCT', 'nucleosome_percentile', 'nucleosome_group', 

In [44]:
df_breast = adata_breast.obs
df_breast["celltype"]

AAACAGCCAATTGAAG-1_1      Epithelial
AAACCAACACCCACCT-1_1      Adipocytes
AAACCAACAGCAAATA-1_1     Fibroblasts
AAACCAACATTAGCCA-1_1     Fibroblasts
AAACCGAAGACTTACA-1_1      Epithelial
                            ...     
TTTGTGAAGGTCCACA-1_16     Epithelial
TTTGTGAAGTCACGAT-1_16     Epithelial
TTTGTGGCAAGCGATG-1_16     Epithelial
TTTGTGGCAGCTACGT-1_16     Epithelial
TTTGTGTTCGGTCATG-1_16     Epithelial
Name: celltype, Length: 51367, dtype: category
Categories (6, object): ['Adipocytes', 'Endothelial', 'Epithelial', 'Fibroblasts', 'Macrophages', 'T-cells']

In [None]:
cell_types = list(set(df_breast["cell_type"].to_list()))
cell_types

['macrophage',
 'adipocyte',
 'luminal hormone-sensing cell of mammary gland',
 'endothelial cell',
 'T cell',
 'basal-myoepithelial cell of mammary gland',
 'luminal adaptive secretory precursor cell of mammary gland',
 'fibroblast']

In [35]:
df_breast["cell_type"].value_counts()

cell_type
basal-myoepithelial cell of mammary gland                     16184
luminal adaptive secretory precursor cell of mammary gland    11456
fibroblast                                                     6526
endothelial cell                                               5407
luminal hormone-sensing cell of mammary gland                  5202
adipocyte                                                      2712
T cell                                                         2240
macrophage                                                     1640
Name: count, dtype: int64

## snRNA-seq of human kidney with ureteral obstruction
Link: https://datasets.cellxgene.cziscience.com/70ae5e5d-5a40-4ab7-b8dd-becc5dffd4ba.h5ad  
Extracted: 15140 disease normal cells

In [3]:
# Load the .h5ad file
adata_kidney = sc.read_h5ad("datasets/70ae5e5d-5a40-4ab7-b8dd-becc5dffd4ba.h5ad")

In [11]:
adata_kidney

AnnData object with n_obs × n_vars = 46957 × 29702
    obs: 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'donor_id', 'is_primary_data', 'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'suspension_type', 'tissue_type', 'tissue_ontology_term_id', 'library_id', 'celltype_l1', 'celltype_l2', 'n_umi', 'n_genes', 'percent_mitochrondrial', 'cell_type', 'assay', 'disease', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid'
    var: 'features', 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length', 'feature_type'
    uns: 'celltype_l1_colors', 'celltype_l2_colors', 'citation', 'default_embedding', 'organism', 'organism_ontology_term_id', 'schema_reference', 'schema_version', 'title'
    obsm: 'X_umap'
    varm: 'PCs'
    obsp: 'distances'

In [10]:
df_kidney = adata_kidney.obs

In [5]:
df_kidney["disease"]

l1_AAACAGCCATCAATCG-1    obstructive nephropathy
l1_AAACAGCCATTGCAGC-1    obstructive nephropathy
l1_AAACCGAAGACAAACG-1    obstructive nephropathy
l1_AAACCGAAGTAATCCA-1    obstructive nephropathy
l1_AAACCGAAGTGTGATC-1    obstructive nephropathy
                                  ...           
l6_TTTAGCTTCGCTAGCA-1                     normal
l6_TTTCGTCCACAGGGAC-1                     normal
l6_TTTGAGTCAGGAACTG-1                     normal
l6_TTTGCATTCGCTCACT-1                     normal
l6_TTTGTGTTCATGCTTT-1                     normal
Name: disease, Length: 46957, dtype: category
Categories (2, object): ['obstructive nephropathy', 'normal']

In [6]:
(df_kidney["disease"] == "normal").sum()

np.int64(15140)

In [7]:
adata_kidney_normal = adata_kidney[adata_kidney.obs["disease"] == "normal"].copy()
df_kidney_normal = adata_kidney_normal.obs

In [None]:
cell_types = list(set(df_kidney_normal["cell_type"].to_list()))
cell_types

['kidney distal convoluted tubule epithelial cell',
 'B cell',
 'renal principal cell',
 'endothelial cell',
 'kidney loop of Henle ascending limb epithelial cell',
 'T cell',
 'interstitial cell',
 'kidney connecting tubule epithelial cell',
 'kidney loop of Henle thin descending limb epithelial cell',
 'parietal epithelial cell',
 'kidney proximal convoluted tubule epithelial cell',
 'podocyte',
 'myeloid cell',
 'kidney loop of Henle thin ascending limb epithelial cell',
 'renal intercalated cell']

In [16]:
# change this if your labels are in a different column (e.g., "celltype_l2")
col = "cell_type"

counts_dict = (
    adata_kidney_normal.obs[col]
    .value_counts()
    .reindex(cell_types, fill_value=0)
    .astype(int)
    .to_dict()
)

counts_dict

{'kidney distal convoluted tubule epithelial cell': 1817,
 'B cell': 59,
 'renal principal cell': 682,
 'endothelial cell': 691,
 'kidney loop of Henle ascending limb epithelial cell': 4806,
 'T cell': 588,
 'interstitial cell': 228,
 'kidney connecting tubule epithelial cell': 1105,
 'kidney loop of Henle thin descending limb epithelial cell': 91,
 'parietal epithelial cell': 211,
 'kidney proximal convoluted tubule epithelial cell': 3213,
 'podocyte': 35,
 'myeloid cell': 455,
 'kidney loop of Henle thin ascending limb epithelial cell': 190,
 'renal intercalated cell': 969}

## 10x scRNA-seq from human cortex  
I think scRNA-seq should be snRNA-seq here, since they say: "we simultaneously profiled gene expression and chromatin accessibility in 45,549 individual nuclei isolated from the human cortex"   
Link: https://datasets.cellxgene.cziscience.com/fe86d86c-16cc-4047-a741-d9e186b35175.h5ad  

In [14]:
# Load the .h5ad file
adata_cortex = sc.read_h5ad("datasets/fe86d86c-16cc-4047-a741-d9e186b35175.h5ad")

In [15]:
df_cortex = adata_cortex.obs

In [16]:
df_cortex

Unnamed: 0_level_0,author_cell_type,age_group,donor_id,nCount_RNA,nFeature_RNA,nCount_ATAC,nFeature_ATAC,TSS_percentile,nucleosome_signal,percent_mt,...,batch,tissue_type,cell_type,assay,disease,sex,tissue,self_reported_ethnicity,development_stage,observation_joinid
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4_AAACAGCCAACACTTG-1,EN-fetal-late,late fetal,LaFet1,3483,1685.0,4518.0,4024.0,0.31,1.103503,0.000000,...,2,tissue,glutamatergic neuron,10x multiome,normal,male,cortical plate,unknown,23rd week post-fertilization stage,t<2(H>sg#K
4_AAACAGCCACCAAAGG-1,EN-fetal-late,late fetal,LaFet1,4863,2149.0,4641.0,4128.0,0.72,1.002789,0.000000,...,2,tissue,glutamatergic neuron,10x multiome,normal,male,cortical plate,unknown,23rd week post-fertilization stage,HPU?^1i&Yf
4_AAACAGCCATAAGTTC-1,EN-fetal-late,late fetal,LaFet1,11069,3707.0,12512.0,10169.0,0.51,1.054060,0.000000,...,2,tissue,glutamatergic neuron,10x multiome,normal,male,cortical plate,unknown,23rd week post-fertilization stage,vKJ1WULiG^
4_AAACATGCATAGTCAT-1,EN-fetal-late,late fetal,LaFet1,7990,2919.0,5437.0,4844.0,0.47,1.141570,0.000000,...,2,tissue,glutamatergic neuron,10x multiome,normal,male,cortical plate,unknown,23rd week post-fertilization stage,FE0>j_!1Tq
4_AAACATGCATTGTCAG-1,EN-fetal-late,late fetal,LaFet1,6873,2652.0,5268.0,4626.0,0.48,1.137788,0.021418,...,2,tissue,glutamatergic neuron,10x multiome,normal,male,cortical plate,unknown,23rd week post-fertilization stage,3sotId$RTP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150666_TTTGTGAAGACAACAG-1,Oligodendrocytes,adulthood,Adult1,926,545.0,6654.0,5913.0,0.34,1.029844,1.500938,...,2,tissue,oligodendrocyte,10x multiome,normal,female,dorsolateral prefrontal cortex,unknown,20-year-old stage,=x`E$@Xdkm
150666_TTTGTGAAGGCTGTGC-1,OPC,adulthood,Adult1,1578,1023.0,5996.0,5311.0,0.03,0.712973,0.121433,...,2,tissue,oligodendrocyte precursor cell,10x multiome,normal,female,dorsolateral prefrontal cortex,unknown,20-year-old stage,#bnI9b>C>}
150666_TTTGTGAAGTAAGAAC-1,Oligodendrocytes,adulthood,Adult1,352,303.0,4787.0,4226.0,0.44,1.304045,0.939702,...,2,tissue,oligodendrocyte,10x multiome,normal,female,dorsolateral prefrontal cortex,unknown,20-year-old stage,mN;XZnza^;
150666_TTTGTGAAGTCTTGAA-1,Oligodendrocytes,adulthood,Adult1,1676,1028.0,3336.0,2957.0,0.25,0.715888,0.119617,...,2,tissue,oligodendrocyte,10x multiome,normal,female,dorsolateral prefrontal cortex,unknown,20-year-old stage,k#NQ|;U)%U


In [None]:
(df_cortex["age_group"] == "adulthood").sum()

7396

## Context-aware single-cell multiomics approach identifies cell-type-specific lung cancer susceptibility genes.
Link: https://datasets.cellxgene.cziscience.com/7e6a97a5-5143-4773-ba41-b95f45a30526.h5ad

In [20]:
# Load the .h5ad file
adata_lung = sc.read_h5ad("datasets/7e6a97a5-5143-4773-ba41-b95f45a30526.h5ad")

In [25]:
adata_lung

AnnData object with n_obs × n_vars = 116778 × 35467
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'author_cell_type', 'donor_id', 'Smoking', 'tissue_ontology_term_id', 'tissue_type', 'assay_ontology_term_id', 'disease_ontology_term_id', 'cell_type_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'sex_ontology_term_id', 'suspension_type', 'is_primary_data', 'cell_type', 'assay', 'disease', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid'
    var: 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length', 'feature_type'
    uns: 'author_cell_type_colors', 'citation', 'organism', 'organism_ontology_term_id', 'schema_reference', 'schema_version', 'title'
    obsm: 'X_umap', 'X_umap.RNA', 'X_umap.SCT_harm'

In [22]:
df_lung = adata_lung.obs
df_lung

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,author_cell_type,donor_id,Smoking,tissue_ontology_term_id,tissue_type,assay_ontology_term_id,...,suspension_type,is_primary_data,cell_type,assay,disease,sex,tissue,self_reported_ethnicity,development_stage,observation_joinid
M1_AAACAGCCAAACGCGA-1,M1,3796.0,1579,4.083246,T,MN1,non_smoker,UBERON:0002048,tissue,EFO:0030059,...,nucleus,True,T cell,10x multiome,normal,male,lung,Korean,60-year-old stage,*R%U~oSa?Q
M1_AAACAGCCATCATGTG-1,M1,1768.0,1185,3.280543,Lymphatic,MN1,non_smoker,UBERON:0002048,tissue,EFO:0030059,...,nucleus,True,lymphatic endothelial cell of trabecula,10x multiome,normal,male,lung,Korean,60-year-old stage,g*lVvQlfbP
M1_AAACAGCCATCGCTTT-1,M1,1857.0,1174,6.300485,T,MN1,non_smoker,UBERON:0002048,tissue,EFO:0030059,...,nucleus,True,T cell,10x multiome,normal,male,lung,Korean,60-year-old stage,k3fp-K}&~o
M1_AAACATGCAACCCTCC-1,M1,1972.0,1211,7.910751,NK,MN1,non_smoker,UBERON:0002048,tissue,EFO:0030059,...,nucleus,True,natural killer cell,10x multiome,normal,male,lung,Korean,60-year-old stage,JWrFiX4C^k
M1_AAACATGCACTAGCGT-1,M1,5185.0,2140,4.783028,Artery,MN1,non_smoker,UBERON:0002048,tissue,EFO:0030059,...,nucleus,True,endothelial cell of artery,10x multiome,normal,male,lung,Korean,60-year-old stage,_bU(UbU5zm
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
N17_20_TTTGTCTAGTATTGGC-1,N17_20,5102.0,2237,5.488044,B,FN4,non_smoker,UBERON:0002048,tissue,EFO:0030059,...,nucleus,True,B cell,10x multiome,normal,female,lung,Korean,68-year-old stage,7{pggq3>t<
N17_20_TTTGTGAAGGCGAAAC-1,N17_20,6261.0,2036,4.999201,Vein,FN4,non_smoker,UBERON:0002048,tissue,EFO:0030059,...,nucleus,True,vein endothelial cell of respiratory system,10x multiome,normal,female,lung,Korean,68-year-old stage,$hWyZ0<!<<
N17_20_TTTGTGAAGGGCTAAA-1,N17_20,3927.0,1807,8.428826,Fibroblast,FN4,non_smoker,UBERON:0002048,tissue,EFO:0030059,...,nucleus,True,fibroblast of lung,10x multiome,normal,female,lung,Korean,68-year-old stage,dcEhw^4`aX
N17_20_TTTGTGGCAGTCTAGC-1,N17_20,4382.0,1705,8.649019,Lymphatic,FN4,non_smoker,UBERON:0002048,tissue,EFO:0030059,...,nucleus,True,lymphatic endothelial cell of trabecula,10x multiome,normal,female,lung,Korean,68-year-old stage,QLXQafu?-6


## Single-nucleus RNA-seq of the Adult Human Kidney (Version 2.0)
Link: https://datasets.cellxgene.cziscience.com/f337b525-c8f7-4c96-8cfe-f258a9f5ca48.h5ad  

In [3]:
# Load the .h5ad file
adata_kidney_2 = sc.read_h5ad("datasets/single-nuclei/f337b525-c8f7-4c96-8cfe-f258a9f5ca48.h5ad", backed="r")

In [8]:
df_kidney_2 = adata_kidney_2.obs
# df_kidney_2_normal = adata_kidney_2_normal.obs

In [10]:
df_kidney_2.shape

(1388643, 46)

In [11]:
cell_types = list(set(df_kidney_2["cell_type"].to_list()))
cell_types

['mucosal invariant T cell',
 'parietal epithelial cell',
 'CD8-alpha-positive thymic conventional dendritic cell',
 'effector memory CD8-positive, alpha-beta T cell, terminally differentiated',
 'kidney resident macrophage',
 'kidney loop of Henle cortical thick ascending limb epithelial cell',
 'neural cell',
 'CD8-alpha-negative thymic conventional dendritic cell',
 'kidney loop of Henle thick ascending limb epithelial cell',
 'macula densa epithelial cell',
 'mast cell',
 'papillary tips cell',
 'vasa recta ascending limb cell',
 'plasma cell',
 'kidney collecting duct principal cell',
 'mesangial cell',
 'vascular associated smooth muscle cell',
 'kidney distal convoluted tubule epithelial cell',
 'adventitial fibroblast',
 'kidney interstitial myofibroblast',
 'peritubular capillary endothelial cell',
 'podocyte',
 'kidney outer medulla peritubular capillary cell',
 'kidney collecting duct alpha-intercalated cell',
 'kidney interstitial fibroblast',
 'B cell',
 'kidney loop of He

In [12]:
# change this if your labels are in a different column (e.g., "celltype_l2")
col = "cell_type"

counts_dict = (
    adata_kidney_2.obs[col]
    .value_counts()
    .reindex(cell_types, fill_value=0)
    .astype(int)
    .to_dict()
)

counts_dict

{'mucosal invariant T cell': 800,
 'parietal epithelial cell': 13790,
 'CD8-alpha-positive thymic conventional dendritic cell': 862,
 'effector memory CD8-positive, alpha-beta T cell, terminally differentiated': 1067,
 'kidney resident macrophage': 22875,
 'kidney loop of Henle cortical thick ascending limb epithelial cell': 92909,
 'neural cell': 477,
 'CD8-alpha-negative thymic conventional dendritic cell': 4320,
 'kidney loop of Henle thick ascending limb epithelial cell': 229803,
 'macula densa epithelial cell': 16158,
 'mast cell': 881,
 'papillary tips cell': 5555,
 'vasa recta ascending limb cell': 17115,
 'plasma cell': 4707,
 'kidney collecting duct principal cell': 92249,
 'mesangial cell': 2391,
 'vascular associated smooth muscle cell': 5152,
 'kidney distal convoluted tubule epithelial cell': 66034,
 'adventitial fibroblast': 9271,
 'kidney interstitial myofibroblast': 3956,
 'peritubular capillary endothelial cell': 1080,
 'podocyte': 16137,
 'kidney outer medulla peritub