# Environment setup

In [87]:
# Standard library imports
import os
import sys
import warnings
from pathlib import Path
from typing import List, Union, Tuple

# Third party imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import anndata
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from umap import UMAP

# Project imports
from abc_atlas_access.abc_atlas_cache.abc_project_cache import AbcProjectCache

# Suppress warnings
# NOTE(review): this silences every warning category for the rest of the
# session; consider narrowing it to specific categories/modules.
warnings.filterwarnings('ignore')

# Set the current working directory.
# The location can be overridden with the SRF_SRRM3_ROOT environment variable;
# when it is unset the original cluster path is used, so behaviour is unchanged.
PROJECT_ROOT = os.environ.get(
    'SRF_SRRM3_ROOT',
    '/beegfs/scratch/ric.broccoli/kubacki.michal/SRF_SRRM3',
)
os.chdir(PROJECT_ROOT)

# Import the project helper module only after the chdir above.
# NOTE(review): presumably the BrainAtlas package is resolved relative to the
# working directory -- confirm against the kernel's sys.path.
from importlib import reload
import BrainAtlas.mouse.ac_create_scRNAseq_set_functions as scrnaseq_set_functions

# Reload BEFORE the star import. `reload()` re-executes the module, but names
# that were already copied into this namespace by `from ... import *` keep
# pointing at the old objects. Reloading first guarantees that the star import
# below binds the freshly executed definitions on every run of this cell.
reload(scrnaseq_set_functions)
from BrainAtlas.mouse.ac_create_scRNAseq_set_functions import *

# Print the current working directory to confirm the change
print(f"Current working directory: {os.getcwd()}")

Current working directory: /beegfs/scratch/ric.broccoli/kubacki.michal/SRF_SRRM3


In [2]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [3]:
%%capture
# Initialize cache
# Local directory handed to AbcProjectCache; created on first use
# (parents=True / exist_ok=True make the mkdir safe to re-run).
download_base = Path('./DATA/abc_atlas')
download_base.mkdir(parents=True, exist_ok=True)
# Build the project cache object rooted at that directory and load the latest
# manifest. NOTE(review): whether this triggers a network request is not
# visible from this notebook -- confirm in the abc_atlas_access documentation.
abc_cache = AbcProjectCache.from_cache_dir(download_base)
abc_cache.load_latest_manifest()
# Bare expression: the cell starts with %%capture, so nothing is displayed here.
abc_cache.current_manifest

# Dataset exploration

In [90]:
# List available directories
# Shows what the loaded manifest exposes, so later cells can pick a directory
# and metadata file by name.
print("Available directories:")
print(abc_cache.list_directories)  # Access the list attribute directly, don't call it as a function

# List available metadata files in WMB-10X directory
# (unlike list_directories, list_metadata_files is called with a directory name)
print("\nAvailable metadata files in WMB-10X directory:")
print(abc_cache.list_metadata_files('WMB-10X'))

Available directories:
['Allen-CCF-2020', 'MERFISH-C57BL6J-638850', 'MERFISH-C57BL6J-638850-CCF', 'MERFISH-C57BL6J-638850-imputed', 'MERFISH-C57BL6J-638850-sections', 'WHB-10Xv3', 'WHB-taxonomy', 'WMB-10X', 'WMB-10XMulti', 'WMB-10Xv2', 'WMB-10Xv3', 'WMB-neighborhoods', 'WMB-taxonomy', 'Zeng-Aging-Mouse-10Xv3', 'Zeng-Aging-Mouse-WMB-taxonomy', 'Zhuang-ABCA-1', 'Zhuang-ABCA-1-CCF', 'Zhuang-ABCA-2', 'Zhuang-ABCA-2-CCF', 'Zhuang-ABCA-3', 'Zhuang-ABCA-3-CCF', 'Zhuang-ABCA-4', 'Zhuang-ABCA-4-CCF']

Available metadata files in WMB-10X directory:
['cell_metadata', 'cell_metadata_with_cluster_annotation', 'example_genes_all_cells_expression', 'gene', 'region_of_interest_metadata']


In [5]:
# Analysis targets: gene symbols and brain-region acronyms of interest.
# 'Celsr3' is a further candidate gene that is currently left out.
genes = [
    'Srrm3',
    'Srrm4',
]
regions = [
    "MOp",
    "SSp",
    "HIP",
    "STRd",
    "MB",
    "CB",
]

## Cell metadata (get_metadata_dataframe - WMB-10X)

In [92]:
# Load the five WMB-10X metadata tables through the ABC atlas cache.
print("Loading metadata files...")


def _load_wmb_10x_metadata(file_name):
    """Fetch one metadata table of the 'WMB-10X' directory by file name."""
    return abc_cache.get_metadata_dataframe(
        directory='WMB-10X',
        file_name=file_name,
    )


# Cell metadata
cell_metadata = _load_wmb_10x_metadata('cell_metadata')

# Cell metadata with cluster annotation
cell_metadata_with_cluster = _load_wmb_10x_metadata('cell_metadata_with_cluster_annotation')

# Example genes expression
example_genes = _load_wmb_10x_metadata('example_genes_all_cells_expression')

# Gene metadata
gene_metadata = _load_wmb_10x_metadata('gene')

# Region metadata
region_metadata = _load_wmb_10x_metadata('region_of_interest_metadata')

Loading metadata files...


In [93]:
# Report the (rows, columns) shape of each WMB-10X metadata table.
for shape_caption, metadata_table in (
    ("Cell metadata shape:", cell_metadata),
    ("Cell metadata with cluster shape:", cell_metadata_with_cluster),
    ("Example genes shape:", example_genes),
    ("Gene metadata shape:", gene_metadata),
    ("Region metadata shape:", region_metadata),
):
    print(shape_caption, metadata_table.shape)

Cell metadata shape: (4042976, 17)
Cell metadata with cluster shape: (4042976, 28)
Example genes shape: (4042976, 11)
Gene metadata shape: (32285, 5)
Region metadata shape: (29, 5)


In [94]:
# Print the column names of each metadata DataFrame under an underlined heading.
# Each entry is (heading text, underline width, DataFrame).
column_report = (
    ("\nCell metadata columns:", 20, cell_metadata),
    ("\nCell metadata with cluster columns:", 35, cell_metadata_with_cluster),
    ("\nExample genes columns:", 20, example_genes),
    ("\nGene metadata columns:", 20, gene_metadata),
    ("\nRegion metadata columns:", 25, region_metadata),
)
for heading, rule_width, metadata_table in column_report:
    print(heading)
    print("-" * rule_width)
    print(", ".join(metadata_table.columns))



Cell metadata columns:
--------------------
cell_label, cell_barcode, barcoded_cell_sample_label, library_label, feature_matrix_label, entity, brain_section_label, library_method, region_of_interest_acronym, donor_label, donor_genotype, donor_sex, dataset_label, x, y, cluster_alias, abc_sample_id

Cell metadata with cluster columns:
-----------------------------------
cell_label, cell_barcode, barcoded_cell_sample_label, library_label, feature_matrix_label, entity, brain_section_label, library_method, region_of_interest_acronym, donor_label, donor_genotype, donor_sex, dataset_label, x, y, cluster_alias, neurotransmitter, class, subclass, supertype, cluster, neurotransmitter_color, class_color, subclass_color, supertype_color, cluster_color, region_of_interest_order, region_of_interest_color

Example genes columns:
--------------------
cell_label, Slc32a1, Slc17a7, Slc6a5, Slc17a6, Slc6a2, Slc17a8, Tac2, Slc6a4, Slc6a3, Slc18a3

Gene metadata columns:
--------------------
gene_identifi

In [95]:
# Preview the first three rows of the cluster-annotated cell metadata.
cell_metadata_with_cluster.head(3)

Unnamed: 0,cell_label,cell_barcode,barcoded_cell_sample_label,library_label,feature_matrix_label,entity,brain_section_label,library_method,region_of_interest_acronym,donor_label,donor_genotype,donor_sex,dataset_label,x,y,cluster_alias,neurotransmitter,class,subclass,supertype,cluster,neurotransmitter_color,class_color,subclass_color,supertype_color,cluster_color,region_of_interest_order,region_of_interest_color
0,GCGAGAAGTTAAGGGC-410_B05,GCGAGAAGTTAAGGGC,410_B05,L8TX_201030_01_C12,WMB-10Xv3-HPF,cell,,10Xv3,RHP,Snap25-IRES2-Cre;Ai14-550850,Ai14(RCL-tdT)/wt,F,WMB-10Xv3,23.146826,-3.086639,1,Glut,01 IT-ET Glut,018 L2 IT PPP-APr Glut,0082 L2 IT PPP-APr Glut_3,0326 L2 IT PPP-APr Glut_3,#2B93DF,#FA0087,#0F6632,#266DFF,#64661F,15,#CCB05C
1,AATGGCTCAGCTCCTT-411_B06,AATGGCTCAGCTCCTT,411_B06,L8TX_201029_01_E10,WMB-10Xv3-HPF,cell,,10Xv3,RHP,Snap25-IRES2-Cre;Ai14-550851,Ai14(RCL-tdT)/wt,F,WMB-10Xv3,23.138481,-3.022,1,Glut,01 IT-ET Glut,018 L2 IT PPP-APr Glut,0082 L2 IT PPP-APr Glut_3,0326 L2 IT PPP-APr Glut_3,#2B93DF,#FA0087,#0F6632,#266DFF,#64661F,15,#CCB05C
2,AACACACGTTGCTTGA-410_B05,AACACACGTTGCTTGA,410_B05,L8TX_201030_01_C12,WMB-10Xv3-HPF,cell,,10Xv3,RHP,Snap25-IRES2-Cre;Ai14-550850,Ai14(RCL-tdT)/wt,F,WMB-10Xv3,23.472557,-2.992709,1,Glut,01 IT-ET Glut,018 L2 IT PPP-APr Glut,0082 L2 IT PPP-APr Glut_3,0326 L2 IT PPP-APr Glut_3,#2B93DF,#FA0087,#0F6632,#266DFF,#64661F,15,#CCB05C


In [96]:
# Distinct values of the main categorical columns, each followed by a blank line,
# plus the number of distinct cluster aliases.
print("Library methods used:", cell_metadata_with_cluster["library_method"].unique(), "\n")
print("Brain regions analyzed:", cell_metadata_with_cluster["region_of_interest_acronym"].unique(), "\n")
print("Brain sections analyzed:", cell_metadata_with_cluster["brain_section_label"].unique(), "\n")
print("Number of unique cell clusters:", len(cell_metadata_with_cluster["cluster_alias"].unique()), "\n")

Library methods used: ['10Xv3' '10Xv2' '10xRSeq_Mult'] 

Brain regions analyzed: ['RHP' 'RSP' 'ACA' 'PL-ILA-ORB' 'AUD-TEa-PERI-ECT' 'SS-GU-VISC' 'MO-FRP'
 'PAL' 'sAMY' 'CTXsp' 'HY' 'STRv' 'OLF' 'LSX' 'AI' 'STRd' 'VIS-PTLp' 'VIS'
 'TH' 'MOp' 'ENT' 'HIP' 'P' 'MB' 'MY' 'CB' 'AUD' 'SSp' 'TEa-PERI-ECT'] 

Brain sections analyzed: [nan] 

Number of unique cell clusters: 5322 



In [97]:
# Preview the first three rows of the region-of-interest metadata table.
region_metadata.head(3)

Unnamed: 0,label,acronym,name,order,color_hex_triplet
0,WMB-MO-FRP,MO-FRP,Somatomotor - Frontal pole,0,#3DCC7C
1,WMB-MOp,MOp,Primary motor area,1,#179968
2,WMB-SS-GU-VISC,SS-GU-VISC,Somatosensory/gustatory/visceral areas,2,#2E8599


In [98]:
# Number of rows per region_of_interest_acronym value, most frequent first.
cell_metadata_with_cluster['region_of_interest_acronym'].value_counts()

region_of_interest_acronym
MB                  367029
VIS                 321908
OLF                 280744
HY                  262175
TH                  261009
MOp                 247660
MY                  192533
CB                  182004
HIP                 176122
P                   143616
CTXsp               122208
sAMY                120764
RSP                 112248
ENT                 110210
PAL                 108046
PL-ILA-ORB          106122
SS-GU-VISC          103192
ACA                 102794
RHP                 102008
AI                   99046
SSp                  75852
MO-FRP               71399
AUD                  70564
TEa-PERI-ECT         59160
STRd                 55626
VIS-PTLp             54625
LSX                  53819
STRv                 53573
AUD-TEa-PERI-ECT     26920
Name: count, dtype: int64

## Annotation metadata (get_metadata_dataframe - WMB-taxonomy)

In [99]:
# List the metadata files available in the WMB-taxonomy directory.
print(abc_cache.list_metadata_files('WMB-taxonomy'))

['cluster', 'cluster_annotation_term', 'cluster_annotation_term_set', 'cluster_annotation_term_with_counts', 'cluster_to_cluster_annotation_membership', 'cluster_to_cluster_annotation_membership_color', 'cluster_to_cluster_annotation_membership_pivoted']


In [100]:
# Print the column names of every WMB-taxonomy metadata file.
# Each entry is (heading text, underline width, metadata file name).
taxonomy_column_report = (
    ("\nCluster metadata columns:", 25, 'cluster'),
    ("\nCluster annotation term columns:", 35, 'cluster_annotation_term'),
    ("\nCluster annotation term set columns:", 40, 'cluster_annotation_term_set'),
    ("\nCluster annotation term with counts columns:", 45, 'cluster_annotation_term_with_counts'),
    ("\nCluster to cluster annotation membership columns:", 50, 'cluster_to_cluster_annotation_membership'),
    ("\nCluster to cluster annotation membership color columns:", 55, 'cluster_to_cluster_annotation_membership_color'),
    ("\nCluster to cluster annotation membership pivoted columns:", 60, 'cluster_to_cluster_annotation_membership_pivoted'),
)
for heading, rule_width, taxonomy_file in taxonomy_column_report:
    print(heading)
    print("-" * rule_width)
    # The table is fetched only after its heading has been printed.
    print(", ".join(abc_cache.get_metadata_dataframe('WMB-taxonomy', taxonomy_file).columns))



Cluster metadata columns:
-------------------------
cluster_alias, number_of_cells, label

Cluster annotation term columns:
-----------------------------------
label, name, cluster_annotation_term_set_label, parent_term_label, parent_term_set_label, term_set_order, term_order, cluster_annotation_term_set_name, color_hex_triplet

Cluster annotation term set columns:
----------------------------------------
label, name, description, order

Cluster annotation term with counts columns:
---------------------------------------------
label, name, cluster_annotation_term_set_label, parent_term_label, parent_term_set_label, term_set_order, term_order, cluster_annotation_term_set_name, color_hex_triplet, number_of_clusters, number_of_cells

Cluster to cluster annotation membership columns:
--------------------------------------------------
cluster_annotation_term_label, cluster_annotation_term_set_label, cluster_alias, cluster_annotation_term_name, cluster_annotation_term_set_name, number_of_ce

In [101]:
# Load the seven WMB-taxonomy metadata tables through the ABC atlas cache.
def _load_wmb_taxonomy_metadata(file_name):
    """Fetch one metadata table of the 'WMB-taxonomy' directory by file name."""
    return abc_cache.get_metadata_dataframe(
        directory='WMB-taxonomy',
        file_name=file_name,
    )


cluster_metadata = _load_wmb_taxonomy_metadata('cluster')

cluster_annotation_term = _load_wmb_taxonomy_metadata('cluster_annotation_term')

cluster_annotation_term_set = _load_wmb_taxonomy_metadata('cluster_annotation_term_set')

cluster_annotation_term_with_counts = _load_wmb_taxonomy_metadata('cluster_annotation_term_with_counts')

cluster_to_cluster_annotation_membership = _load_wmb_taxonomy_metadata('cluster_to_cluster_annotation_membership')

cluster_to_cluster_annotation_membership_color = _load_wmb_taxonomy_metadata('cluster_to_cluster_annotation_membership_color')

cluster_to_cluster_annotation_membership_pivoted = _load_wmb_taxonomy_metadata('cluster_to_cluster_annotation_membership_pivoted')

In [102]:
# Preview the first three rows of the taxonomy 'cluster' table.
cluster_metadata.head(3)

Unnamed: 0,cluster_alias,number_of_cells,label
0,1,727,CS20230722_0001
1,10,740,CS20230722_0010
2,100,1053,CS20230722_0100


In [103]:
# Table size, distinct values of the 'name' column, then a three-row preview.
print(cluster_annotation_term.shape)
print(cluster_annotation_term["name"].unique())
cluster_annotation_term.head(n=3)

(6905, 9)
['Glut' nan 'GABA' ... '5320 ILC NN_2' '5321 NK cells NN_3'
 '5322 T cells NN_4']


Unnamed: 0,label,name,cluster_annotation_term_set_label,parent_term_label,parent_term_set_label,term_set_order,term_order,cluster_annotation_term_set_name,color_hex_triplet
0,CS20230722_NEUR_Glut,Glut,CCN20230722_NEUR,,,0,0,neurotransmitter,#2B93DF
1,CS20230722_NEUR_NA,,CCN20230722_NEUR,,,0,1,neurotransmitter,#666666
2,CS20230722_NEUR_GABA,GABA,CCN20230722_NEUR,,,0,2,neurotransmitter,#FF3358


In [104]:
# Table size, distinct values of the 'name' column, then a five-row preview.
print(cluster_annotation_term_set.shape)
print(cluster_annotation_term_set["name"].unique())
cluster_annotation_term_set.head(n=5)

(5, 4)
['neurotransmitter' 'class' 'subclass' 'supertype' 'cluster']


Unnamed: 0,label,name,description,order
0,CCN20230722_NEUR,neurotransmitter,Clusters are assigned based on the average exp...,0
1,CCN20230722_CLAS,class,The top level of cell type definition in the m...,1
2,CCN20230722_SUBC,subclass,The coarse level of cell type definition in th...,2
3,CCN20230722_SUPT,supertype,The second finest level of cell type definitio...,3
4,CCN20230722_CLUS,cluster,The finest level of cell type definition in th...,4


In [105]:
# Preview the first three rows of the annotation-term table that includes counts.
cluster_annotation_term_with_counts.head(3)

Unnamed: 0,label,name,cluster_annotation_term_set_label,parent_term_label,parent_term_set_label,term_set_order,term_order,cluster_annotation_term_set_name,color_hex_triplet,number_of_clusters,number_of_cells
0,CS20230722_NEUR_Glut,Glut,CCN20230722_NEUR,,,0,0,neurotransmitter,#2B93DF,2561,2054137
1,CS20230722_NEUR_NA,,CCN20230722_NEUR,,,0,1,neurotransmitter,#666666,127,1089152
2,CS20230722_NEUR_GABA,GABA,CCN20230722_NEUR,,,0,2,neurotransmitter,#FF3358,1991,834601


In [106]:
# Print the (rows, columns) shape of the membership table, then preview its
# first three rows.
print(cluster_to_cluster_annotation_membership.shape)
cluster_to_cluster_annotation_membership.head(3)

(26610, 7)


Unnamed: 0,cluster_annotation_term_label,cluster_annotation_term_set_label,cluster_alias,cluster_annotation_term_name,cluster_annotation_term_set_name,number_of_cells,color_hex_triplet
0,CS20230722_CLUS_0001,CCN20230722_CLUS,128,0001 CLA-EPd-CTX Car3 Glut_1,cluster,4262,#00664E
1,CS20230722_CLUS_0002,CCN20230722_CLUS,129,0002 CLA-EPd-CTX Car3 Glut_1,cluster,3222,#5C79CC
2,CS20230722_CLUS_0003,CCN20230722_CLUS,130,0003 CLA-EPd-CTX Car3 Glut_1,cluster,12216,#86FF4D


In [107]:
# Preview the first three rows of the membership colour table.
cluster_to_cluster_annotation_membership_color.head(3)

Unnamed: 0,cluster_alias,neurotransmitter_color,class_color,subclass_color,supertype_color,cluster_color
0,1,#2B93DF,#FA0087,#0F6632,#266DFF,#64661F
1,2,#2B93DF,#FA0087,#0F6632,#266DFF,#CCA73D
2,3,#2B93DF,#FA0087,#0F6632,#002BCC,#99000D


In [108]:
# Table size, distinct values of the 'neurotransmitter' column, then a
# three-row preview.
print(cluster_to_cluster_annotation_membership_pivoted.shape)
print(cluster_to_cluster_annotation_membership_pivoted["neurotransmitter"].unique())
cluster_to_cluster_annotation_membership_pivoted.head(n=3)

(5322, 6)
['Glut' 'GABA' 'Chol' 'Glut-GABA' nan 'Dopa' 'Hist' 'GABA-Glyc' 'Sero'
 'Nora']


Unnamed: 0,cluster_alias,neurotransmitter,class,subclass,supertype,cluster
0,1,Glut,01 IT-ET Glut,018 L2 IT PPP-APr Glut,0082 L2 IT PPP-APr Glut_3,0326 L2 IT PPP-APr Glut_3
1,2,Glut,01 IT-ET Glut,018 L2 IT PPP-APr Glut,0082 L2 IT PPP-APr Glut_3,0327 L2 IT PPP-APr Glut_3
2,3,Glut,01 IT-ET Glut,018 L2 IT PPP-APr Glut,0081 L2 IT PPP-APr Glut_2,0322 L2 IT PPP-APr Glut_2


In [109]:
def _distinct_values(column_name):
    """Return the distinct values of one cell_metadata_with_cluster column as a list."""
    return list(cell_metadata_with_cluster[column_name].unique())


# feature matrix label
available_regions = _distinct_values('feature_matrix_label')
print(f"Available regions (feature_matrix_label): \n {available_regions}, length: {len(available_regions)}")

# region of interest acronym (rebinds available_regions to the acronym list)
print("\n")
available_regions = _distinct_values('region_of_interest_acronym')
print(f"Available regions (region_of_interest_acronym): \n {available_regions}, length: {len(available_regions)}")

# brain section label
print("\n")
brain_sections = _distinct_values('brain_section_label')
print(f"brain_section_label: \n {brain_sections}, length: {len(brain_sections)}")

# entity
print("\n")
entities = _distinct_values('entity')
print(f"entity: \n {entities}, length: {len(entities)}")


Available regions (feature_matrix_label): 
 ['WMB-10Xv3-HPF', 'WMB-10Xv3-Isocortex-1', 'WMB-10Xv3-PAL', 'WMB-10Xv3-STR', 'WMB-10Xv3-CTXsp', 'WMB-10Xv3-HY', 'WMB-10Xv3-OLF', 'WMB-10Xv3-TH', 'WMB-10Xv3-P', 'WMB-10Xv3-MB', 'WMB-10Xv3-MY', 'WMB-10Xv3-CB', 'WMB-10Xv3-Isocortex-2', 'WMB-10Xv2-HPF', 'WMB-10Xv2-Isocortex-1', 'WMB-10Xv2-HY', 'WMB-10Xv2-TH', 'WMB-10Xv2-OLF', 'WMB-10Xv2-CTXsp', 'WMB-10Xv2-MB', 'WMB-10Xv2-Isocortex-2', 'WMB-10Xv2-Isocortex-3', 'WMB-10Xv2-Isocortex-4', 'WMB-10XMulti'], length: 24


Available regions (region_of_interest_acronym): 
 ['RHP', 'RSP', 'ACA', 'PL-ILA-ORB', 'AUD-TEa-PERI-ECT', 'SS-GU-VISC', 'MO-FRP', 'PAL', 'sAMY', 'CTXsp', 'HY', 'STRv', 'OLF', 'LSX', 'AI', 'STRd', 'VIS-PTLp', 'VIS', 'TH', 'MOp', 'ENT', 'HIP', 'P', 'MB', 'MY', 'CB', 'AUD', 'SSp', 'TEa-PERI-ECT'], length: 29


brain_section_label: 
 [nan], length: 1


entity: 
 ['cell'], length: 1
