# Cell type annotation dataset processing notebook

This notebook contains processing scripts for each dataset used for cell type annotation.

First, run the header block below. Then you can run any one of the data processing blocks without running the full notebook.

In [1]:
### Uncomment and run this block to install necessary packages
### (%pip installs into the environment of the running kernel; scikit-learn provides LabelEncoder)
# %pip install -U anndata scanpy gdown pandas tqdm mygene scikit-learn

In [2]:
### Dev only
### Reload the helper module on edit; the module path matches the import in the header block.
# %load_ext autoreload
# %aimport Heimdall.data_processing_utils
# %autoreload 1

In [6]:
# Header block
import os
from pathlib import Path

import anndata as ad
import gdown
import mygene
import pandas as pd
import scanpy as sc
from sklearn.preprocessing import LabelEncoder

from Heimdall.data_processing_utils import symbol_to_ensembl_from_ensembl

### Change project dir here ###
# PROJECT_DIR = Path().resolve().parent  # use the project repository root dir as the project dir
PROJECT_DIR = Path("/work/magroup/shared/Heimdall")
#################################

# Directory layout used by every section: <project>/data/cell_type_annotation/{raw,processed}
DATA_DIR = PROJECT_DIR.joinpath("data", "cell_type_annotation")
RAW_DATA_DIR = DATA_DIR.joinpath("raw")
PROCESSED_DATA_DIR = DATA_DIR.joinpath("processed")

# Create both output directories up front (no-op when they already exist).
for _required_dir in (RAW_DATA_DIR, PROCESSED_DATA_DIR):
    _required_dir.mkdir(parents=True, exist_ok=True)

print(f"Raw data will be saved to: {RAW_DATA_DIR}")
print(f"Processed data will be saved to: {PROCESSED_DATA_DIR}")

Raw data will be saved to: /work/magroup/shared/Heimdall/data/cell_type_annotation/raw
Processed data will be saved to: /work/magroup/shared/Heimdall/data/cell_type_annotation/processed


  from .autonotebook import tqdm as notebook_tqdm


## Pancreas

https://openproblems.bio/results/label_projection/

In [2]:
# Download the pancreas dataset (.h5ad) from figshare into the raw data directory.
!wget https://ndownloader.figshare.com/files/36086813 -O {RAW_DATA_DIR}/pancreas.h5ad

--2024-07-10 11:05:08--  https://ndownloader.figshare.com/files/36086813
Resolving ndownloader.figshare.com (ndownloader.figshare.com)... 54.78.158.183, 54.194.168.157, 34.250.28.103, ...
Connecting to ndownloader.figshare.com (ndownloader.figshare.com)|54.78.158.183|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/36086813/pancreas.h5ad?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIYCQYOYV5JSSROOA/20240710/eu-west-1/s3/aws4_request&X-Amz-Date=20240710T150509Z&X-Amz-Expires=10&X-Amz-SignedHeaders=host&X-Amz-Signature=fad9479828cec518f80fb31951c85a22f5ce3d5e240699af12021b9c067f2ae5 [following]
--2024-07-10 11:05:09--  https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/36086813/pancreas.h5ad?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIYCQYOYV5JSSROOA/20240710/eu-west-1/s3/aws4_request&X-Amz-Date=20240710T150509Z&X-Amz-Expires=10&X-Amz-SignedHeaders=host&X-Amz-Signature=fad9479828cec51

In [3]:
# Load raw data
pancreas_raw_path = RAW_DATA_DIR / "pancreas.h5ad"
data = ad.read_h5ad(pancreas_raw_path)
data

AnnData object with n_obs × n_vars = 16382 × 19093
    obs: 'tech', 'celltype', 'size_factors'
    layers: 'counts'

In [4]:
# Filtering: drop genes detected in fewer than `min_cells_per_gene` cells (modifies `data` in place)
min_cells_per_gene = 100
sc.pp.filter_genes(data, min_cells=min_cells_per_gene, inplace=True)
data

AnnData object with n_obs × n_vars = 16382 × 15309
    obs: 'tech', 'celltype', 'size_factors'
    var: 'n_cells'
    layers: 'counts'

In [5]:
# Gene id mapping: the var index (gene symbols) is mapped and then replaced
gene_symbols = data.var.index.tolist()
symbol_to_ensembl_mapping = symbol_to_ensembl_from_ensembl(
    data_dir=PROJECT_DIR / "data",
    genes=gene_symbols,
    species="human",
)
# Keep the full mapping alongside the data for provenance.
data.uns["gene_mapping:symbol_to_ensembl"] = symbol_to_ensembl_mapping.mapping_full

# Preserve the original symbols as a column before the index is overwritten.
data.var["gene_symbol"] = data.var.index
data.var["gene_ensembl"] = data.var["gene_symbol"].map(symbol_to_ensembl_mapping.mapping_combined.get)
# NOTE(review): the helper logs that not every symbol is mapped; confirm what
# mapping_reduced yields for unmapped symbols (NaN/duplicate index entries?) — TODO verify.
data.var.index = data.var.index.map(symbol_to_ensembl_mapping.mapping_reduced)

data.var

Mapping data directory: /work/magroup/shared/Heimdall/data/gene_mapping/ensembl/human
Loading mapping from cache: /work/magroup/shared/Heimdall/data/gene_mapping/ensembl/human/symbol_to_ensembl.json
Successfully mapped 14,437 out of 15,309 genes (94.3%)


Unnamed: 0,n_cells,gene_symbol,gene_ensembl
ENSG00000121410,3178,A1BG,ENSG00000121410
ENSG00000148584,5937,A1CF,ENSG00000148584
ENSG00000175899,789,A2M,ENSG00000175899
ENSG00000166535,487,A2ML1,ENSG00000166535
ENSG00000128274,879,A4GALT,ENSG00000128274
...,...,...,...
ENSG00000070476,5128,ZXDC,ENSG00000070476
ENSG00000162378,6073,ZYG11B,ENSG00000162378
ENSG00000159840,5334,ZYX,ENSG00000159840
ENSG00000074755,5093,ZZEF1,ENSG00000074755


In [6]:
# Standardize attributes
data.obs["species"] = "Homo sapiens"

# Integer-encode cell types; code k corresponds to data.uns["celltype_order"][k].
celltype_encoder = LabelEncoder()
data.obs["task_celltype"] = celltype_encoder.fit_transform(data.obs["celltype"])
data.uns["celltype_order"] = celltype_encoder.classes_.tolist()

# Same encoding scheme for the batch label, taken from the "tech" column.
batch_encoder = LabelEncoder()
data.obs["batch"] = batch_encoder.fit_transform(data.obs["tech"])
data.uns["batch_order"] = batch_encoder.classes_.tolist()

data

AnnData object with n_obs × n_vars = 16382 × 15309
    obs: 'tech', 'celltype', 'size_factors', 'species', 'task_celltype', 'batch'
    var: 'n_cells', 'gene_symbol', 'gene_ensembl'
    uns: 'gene_mapping:symbol_to_ensembl', 'celltype_order', 'batch_order'
    layers: 'counts'

In [7]:
# Save processed data
pancreas_out_path = PROCESSED_DATA_DIR / "pancreas.h5ad"
data.write_h5ad(pancreas_out_path)

## Zheng68k

In [8]:
!wget https://cf.10xgenomics.com/samples/cell-exp/1.1.0/fresh_68k_pbmc_donor_a/fresh_68k_pbmc_donor_a_filtered_gene_bc_matrices.tar.gz --no-check-certificate -O {RAW_DATA_DIR}/zheng68k.tar.gz
!tar -xzvf {RAW_DATA_DIR}/zheng68k.tar.gz -C {RAW_DATA_DIR} && mv {RAW_DATA_DIR}/filtered_matrices_mex/hg19 {RAW_DATA_DIR}/zheng68k

!wget https://github.com/10XGenomics/single-cell-3prime-paper/raw/989aeed58745e01fe13acc439bdc19c2c185a1aa/pbmc68k_analysis/68k_pbmc_barcodes_annotation.tsv -O {RAW_DATA_DIR}/zheng68k/annotation.tsv

--2024-07-10 11:07:12--  https://cf.10xgenomics.com/samples/cell-exp/1.1.0/fresh_68k_pbmc_donor_a/fresh_68k_pbmc_donor_a_filtered_gene_bc_matrices.tar.gz
Resolving cf.10xgenomics.com (cf.10xgenomics.com)... 104.18.1.173, 104.18.0.173, 2606:4700::6812:ad, ...
Connecting to cf.10xgenomics.com (cf.10xgenomics.com)|104.18.1.173|:443... connected.
  Issued certificate has expired.
HTTP request sent, awaiting response... 200 OK
Length: 124442812 (119M) [application/x-tar]
Saving to: ‘/work/magroup/shared/Heimdall/data/cell_type_annotation/raw/zheng68k.tar.gz’


2024-07-10 11:07:14 (77.7 MB/s) - ‘/work/magroup/shared/Heimdall/data/cell_type_annotation/raw/zheng68k.tar.gz’ saved [124442812/124442812]

filtered_matrices_mex/
filtered_matrices_mex/hg19/
filtered_matrices_mex/hg19/barcodes.tsv
filtered_matrices_mex/hg19/genes.tsv
filtered_matrices_mex/hg19/matrix.mtx
--2024-07-10 11:07:25--  https://github.com/10XGenomics/single-cell-3prime-paper/raw/989aeed58745e01fe13acc439bdc19c2c185a1aa/pbmc6

In [9]:
# Consolidate anndata
zheng_dir = RAW_DATA_DIR / "zheng68k"

# Read the matrix and transpose it before attaching obs/var.
data = ad.read_mtx(zheng_dir / "matrix.mtx").T

barcode_df = pd.read_csv(zheng_dir / "barcodes.tsv", header=None)
gene_df = pd.read_csv(zheng_dir / "genes.tsv", sep="\t", header=None, names=["Symbol"], index_col=0)

# Put the annotations in the same order as the barcodes file.
cell_annot = pd.read_csv(zheng_dir / "annotation.tsv", sep="\t")
cell_annot = cell_annot.set_index("barcodes").reindex(barcode_df[0])
cell_annot.index.name = None

data.obs = cell_annot
data.var = gene_df

data

AnnData object with n_obs × n_vars = 68579 × 32738
    obs: 'TSNE.1', 'TSNE.2', 'celltype'
    var: 'Symbol'

In [10]:
# Filtering: drop genes detected in fewer than `min_cells_per_gene` cells (modifies `data` in place)
min_cells_per_gene = 100
sc.pp.filter_genes(data, min_cells=min_cells_per_gene, inplace=True)
data

AnnData object with n_obs × n_vars = 68579 × 11240
    obs: 'TSNE.1', 'TSNE.2', 'celltype'
    var: 'Symbol', 'n_cells'

In [11]:
# Standardize attributes
data.obs["species"] = "Homo sapiens"

# Integer-encode cell types; code k corresponds to data.uns["celltype_order"][k].
celltype_encoder = LabelEncoder()
data.obs["task_celltype"] = celltype_encoder.fit_transform(data.obs["celltype"])
data.uns["celltype_order"] = celltype_encoder.classes_.tolist()

# Use the notebook's standard var column names; the var index is copied into gene_ensembl
# (presumably Ensembl ids from the first genes.tsv column — verify).
var_column_map = {"Symbol": "gene_symbol"}
data.var.rename(columns=var_column_map, inplace=True)
data.var["gene_ensembl"] = data.var.index

data

AnnData object with n_obs × n_vars = 68579 × 11240
    obs: 'TSNE.1', 'TSNE.2', 'celltype', 'species', 'task_celltype'
    var: 'gene_symbol', 'n_cells', 'gene_ensembl'
    uns: 'celltype_order'

In [12]:
# Save processed data
zheng68k_out_path = PROCESSED_DATA_DIR / "zheng68k.h5ad"
data.write_h5ad(zheng68k_out_path)

## Multiple Sclerosis (MS)

https://github.com/bowang-lab/scGPT/tree/main/data

In [13]:
# Fetch the MS dataset from its Google Drive share link; fuzzy=True lets gdown
# pull the file id out of a ".../view?usp=..." style URL.
ms_share_url = "https://drive.google.com/file/d/1casFhq4InuBNhJLMnGebzkRXM2UTTeQG/view?usp=drive_link"
ms_raw_path = RAW_DATA_DIR / "ms.h5ad"
gdown.download(url=ms_share_url, output=str(ms_raw_path), fuzzy=True)

# gdown.download(
#     url="https://drive.google.com/file/d/1bV1SHKVZgkcL-RmmuN51_IIUJTSJbXOi/view?usp=drive_link",
#     output=str(RAW_DATA_DIR / "ms_c.h5ad"),
#     fuzzy=True,
# )

Downloading...
From: https://drive.google.com/uc?id=1casFhq4InuBNhJLMnGebzkRXM2UTTeQG
To: /work/magroup/shared/Heimdall/data/cell_type_annotation/raw/ms.h5ad
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 47.1M/47.1M [00:00<00:00, 89.2MB/s]


'/work/magroup/shared/Heimdall/data/cell_type_annotation/raw/ms.h5ad'

In [14]:
# Load raw data
ms_raw_path = RAW_DATA_DIR / "ms.h5ad"
data = ad.read_h5ad(ms_raw_path)
data

AnnData object with n_obs × n_vars = 13468 × 3000
    obs: 'Sample Characteristic[organism]', 'Sample Characteristic Ontology Term[organism]', 'Sample Characteristic[individual]', 'Sample Characteristic Ontology Term[individual]', 'Sample Characteristic[sex]', 'Sample Characteristic Ontology Term[sex]', 'Sample Characteristic[age]', 'Sample Characteristic Ontology Term[age]', 'Sample Characteristic[developmental stage]', 'Sample Characteristic Ontology Term[developmental stage]', 'Sample Characteristic[organism part]', 'Sample Characteristic Ontology Term[organism part]', 'Sample Characteristic[sampling site]', 'Sample Characteristic Ontology Term[sampling site]', 'Sample Characteristic[disease]', 'Sample Characteristic Ontology Term[disease]', 'Sample Characteristic[organism status]', 'Sample Characteristic Ontology Term[organism status]', 'Sample Characteristic[cause of death]', 'Sample Characteristic Ontology Term[cause of death]', 'Sample Characteristic[clinical history]', 'Sample 

In [15]:
# Filtering: drop genes detected in fewer than `min_cells_per_gene` cells (modifies `data` in place)
min_cells_per_gene = 100
sc.pp.filter_genes(data, min_cells=min_cells_per_gene, inplace=True)
data

AnnData object with n_obs × n_vars = 13468 × 2336
    obs: 'Sample Characteristic[organism]', 'Sample Characteristic Ontology Term[organism]', 'Sample Characteristic[individual]', 'Sample Characteristic Ontology Term[individual]', 'Sample Characteristic[sex]', 'Sample Characteristic Ontology Term[sex]', 'Sample Characteristic[age]', 'Sample Characteristic Ontology Term[age]', 'Sample Characteristic[developmental stage]', 'Sample Characteristic Ontology Term[developmental stage]', 'Sample Characteristic[organism part]', 'Sample Characteristic Ontology Term[organism part]', 'Sample Characteristic[sampling site]', 'Sample Characteristic Ontology Term[sampling site]', 'Sample Characteristic[disease]', 'Sample Characteristic Ontology Term[disease]', 'Sample Characteristic[organism status]', 'Sample Characteristic Ontology Term[organism status]', 'Sample Characteristic[cause of death]', 'Sample Characteristic Ontology Term[cause of death]', 'Sample Characteristic[clinical history]', 'Sample 

In [16]:
# Standardize attributes
data.obs["species"] = "Homo sapiens"
# Rename the var columns to the notebook's standard names. Do this exactly once:
# DataFrame.rename ignores keys that are not present, so a repeated (or misspelled)
# rename would silently do nothing.
data.var.rename(columns={"index_column": "gene_ensembl", "gene_name": "gene_symbol"}, inplace=True)

le = LabelEncoder()

# Integer-encode cell types; code k corresponds to data.uns["celltype_order"][k].
data.obs["task_celltype"] = le.fit_transform(data.obs["celltype"])
data.uns["celltype_order"] = le.classes_.tolist()

data

AnnData object with n_obs × n_vars = 13468 × 2336
    obs: 'Sample Characteristic[organism]', 'Sample Characteristic Ontology Term[organism]', 'Sample Characteristic[individual]', 'Sample Characteristic Ontology Term[individual]', 'Sample Characteristic[sex]', 'Sample Characteristic Ontology Term[sex]', 'Sample Characteristic[age]', 'Sample Characteristic Ontology Term[age]', 'Sample Characteristic[developmental stage]', 'Sample Characteristic Ontology Term[developmental stage]', 'Sample Characteristic[organism part]', 'Sample Characteristic Ontology Term[organism part]', 'Sample Characteristic[sampling site]', 'Sample Characteristic Ontology Term[sampling site]', 'Sample Characteristic[disease]', 'Sample Characteristic Ontology Term[disease]', 'Sample Characteristic[organism status]', 'Sample Characteristic Ontology Term[organism status]', 'Sample Characteristic[cause of death]', 'Sample Characteristic Ontology Term[cause of death]', 'Sample Characteristic[clinical history]', 'Sample 

In [17]:
# Save processed data
ms_out_path = PROCESSED_DATA_DIR / "ms.h5ad"
data.write_h5ad(ms_out_path)

## Myeloid (MYE)

https://github.com/bowang-lab/scGPT/tree/main/data

In [18]:
# Fetch the myeloid dataset from its Google Drive share link; fuzzy=True lets gdown
# pull the file id out of a ".../view?usp=..." style URL.
mye_share_url = "https://drive.google.com/file/d/1iZ5Am1uS2keIdXqnsZh4Do6y1v_LlWGd/view?usp=drive_link"
mye_raw_path = RAW_DATA_DIR / "mye.h5ad"
gdown.download(url=mye_share_url, output=str(mye_raw_path), fuzzy=True)

# gdown.download(
#     url="https://drive.google.com/file/d/1U556qLQMUCX1i2VZjJOVhPmpYSo71ZAj/view?usp=drive_link",
#     output=str(RAW_DATA_DIR / "mye_ref.h5ad"),
#     fuzzy=True,
# )

Downloading...
From: https://drive.google.com/uc?id=1iZ5Am1uS2keIdXqnsZh4Do6y1v_LlWGd
To: /work/magroup/shared/Heimdall/data/cell_type_annotation/raw/mye.h5ad
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 43.2M/43.2M [00:00<00:00, 85.0MB/s]


'/work/magroup/shared/Heimdall/data/cell_type_annotation/raw/mye.h5ad'

In [19]:
# Load raw data
mye_raw_path = RAW_DATA_DIR / "mye.h5ad"
data = ad.read_h5ad(mye_raw_path)
data

AnnData object with n_obs × n_vars = 3430 × 3000
    obs: 'cell_type', 'cancer_type', 'batch'
    uns: 'cancer_type_colors', 'cell_type_colors', 'log1p', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

In [20]:
# Filtering: drop genes detected in fewer than `min_cells_per_gene` cells (modifies `data` in place)
min_cells_per_gene = 100
sc.pp.filter_genes(data, min_cells=min_cells_per_gene, inplace=True)
data

AnnData object with n_obs × n_vars = 3430 × 2455
    obs: 'cell_type', 'cancer_type', 'batch'
    var: 'n_cells'
    uns: 'cancer_type_colors', 'cell_type_colors', 'log1p', 'neighbors', 'pca', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

In [21]:
# Gene id mapping: the var index (gene symbols) is mapped and then replaced.
# The helper is imported by name in the header block, so call it directly; the
# module name `data_processing_utils` is never bound and raises NameError on a fresh kernel.
symbol_to_ensembl_mapping = symbol_to_ensembl_from_ensembl(
    data_dir=PROJECT_DIR / "data", genes=data.var.index.tolist(), species="human")
# Keep the full mapping alongside the data for provenance.
data.uns["gene_mapping:symbol_to_ensembl"] = symbol_to_ensembl_mapping.mapping_full

# Preserve the original symbols as a column before the index is overwritten.
data.var["gene_symbol"] = data.var.index
data.var["gene_ensembl"] = data.var["gene_symbol"].map(symbol_to_ensembl_mapping.mapping_combined.get)
# NOTE(review): the helper logs that not every symbol is mapped; confirm what
# mapping_reduced yields for unmapped symbols (NaN/duplicate index entries?) — TODO verify.
data.var.index = data.var.index.map(symbol_to_ensembl_mapping.mapping_reduced)

data.var

Mapping data directory: /work/magroup/shared/Heimdall/data/gene_mapping/ensembl/human
Loading mapping from cache: /work/magroup/shared/Heimdall/data/gene_mapping/ensembl/human/symbol_to_ensembl.json
Successfully mapped 2,267 out of 2,455 genes (92.3%)


Unnamed: 0,n_cells,gene_symbol,gene_ensembl
ENSG00000188290,811,HES4,ENSG00000188290
ENSG00000187608,1524,ISG15,ENSG00000187608
ENSG00000188157,103,AGRN,ENSG00000188157
ENSG00000186827,129,TNFRSF4,ENSG00000186827
ENSG00000248333,566,CDK11B,ENSG00000248333
...,...,...,...
ENSG00000160285,127,LSS,ENSG00000160285
ENSG00000160307,191,S100B,ENSG00000160307
ENSG00000198899,3395,MT-ATP6,ENSG00000198899
ENSG00000198840,3325,MT-ND3,ENSG00000198840


In [22]:
# Standardize attributes
data.obs["species"] = "Homo sapiens"
obs_column_map = {"cell_type": "celltype"}
data.obs.rename(columns=obs_column_map, inplace=True)

# Integer-encode cell types; code k corresponds to data.uns["celltype_order"][k].
celltype_encoder = LabelEncoder()
data.obs["task_celltype"] = celltype_encoder.fit_transform(data.obs["celltype"])
data.uns["celltype_order"] = celltype_encoder.classes_.tolist()

# The existing "batch" column is replaced by its integer codes.
# NOTE(review): this overwrites the source column, so re-running this cell without
# reloading the data would encode the codes themselves and change batch_order.
batch_encoder = LabelEncoder()
data.obs["batch"] = batch_encoder.fit_transform(data.obs["batch"])
data.uns["batch_order"] = batch_encoder.classes_.tolist()

data

AnnData object with n_obs × n_vars = 3430 × 2455
    obs: 'celltype', 'cancer_type', 'batch', 'species', 'task_celltype'
    var: 'n_cells', 'gene_symbol', 'gene_ensembl'
    uns: 'cancer_type_colors', 'cell_type_colors', 'log1p', 'neighbors', 'pca', 'umap', 'gene_mapping:symbol_to_ensembl', 'celltype_order', 'batch_order'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

In [23]:
# Save processed data
mye_out_path = PROCESSED_DATA_DIR / "mye.h5ad"
data.write_h5ad(mye_out_path)

## Tabula Muris

https://tabula-muris.ds.czbiohub.org/

In [24]:
# Download the Tabula Muris figshare article archive (version 3).
!wget https://figshare.com/ndownloader/articles/5968960/versions/3 -O {RAW_DATA_DIR}/tm.zip
# Unpack the outer archive (metadata/annotation CSVs plus droplet.zip) into raw/tm ...
!unzip {RAW_DATA_DIR}/tm.zip -d {RAW_DATA_DIR}/tm
# ... then unpack the nested droplet.zip into the same directory.
!unzip {RAW_DATA_DIR}/tm/droplet.zip -d {RAW_DATA_DIR}/tm

--2024-07-10 11:08:00--  https://figshare.com/ndownloader/articles/5968960/versions/3
Resolving figshare.com (figshare.com)... 52.215.106.69, 52.212.236.71, 2a05:d018:1f4:d000:f6c4:ca19:842d:9398, ...
Connecting to figshare.com (figshare.com)|52.215.106.69|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 400440574 (382M) [application/zip]
Saving to: ‘/work/magroup/shared/Heimdall/data/cell_type_annotation/raw/tm.zip’


2024-07-10 11:08:15 (26.5 MB/s) - ‘/work/magroup/shared/Heimdall/data/cell_type_annotation/raw/tm.zip’ saved [400440574/400440574]

Archive:  /work/magroup/shared/Heimdall/data/cell_type_annotation/raw/tm.zip
 extracting: /work/magroup/shared/Heimdall/data/cell_type_annotation/raw/tm/metadata_droplet.csv  
 extracting: /work/magroup/shared/Heimdall/data/cell_type_annotation/raw/tm/droplet.zip  
 extracting: /work/magroup/shared/Heimdall/data/cell_type_annotation/raw/tm/annotations_droplet.csv  
Archive:  /work/magroup/shared/Heimdall/data/cell_ty

In [25]:
# Load raw data
tm_data_dir = RAW_DATA_DIR / "tm"
tm_droplet_data_dir = tm_data_dir / "droplet"

# Annotation columns to keep:
# exclude object typed (those with NaN's) columns (https://github.com/scverse/anndata/issues/558)
annot_columns = [
    "cell_ontology_class",
    "cell_ontology_id",
    "channel",
    "cluster.ids",
    "mouse.id",
    "mouse.sex",
    "tissue",
    "tissue_tSNE_1",
    "tissue_tSNE_2",
]
annot_df = pd.read_csv(tm_data_dir / "annotations_droplet.csv", low_memory=False, index_col=0)
annot_df.index.name = None
annot_df = annot_df[annot_columns].copy()

# Set of annotated cell ids, for fast membership checks below.
all_cells = set(annot_df.index)

data_list = []
for sample_name in os.listdir(tm_droplet_data_dir):
    if sample_name == ".DS_Store":
        continue

    print(f"Extracting {sample_name}")
    sample_dir = tm_droplet_data_dir / sample_name
    # Directory names have the form "<tissue>-<channel>"; only the channel is used.
    _tissue, channel = sample_name.split("-")

    bcs = pd.read_csv(sample_dir / "barcodes.tsv", sep="\t", header=None, names=["raw_cell_id"])
    genes = pd.read_csv(sample_dir / "genes.tsv", sep="\t", header=None)
    data_partial = ad.read_mtx(sample_dir / "matrix.mtx").T

    # Cell ids are "<channel>_<barcode part before the first '-'>"; keep only annotated ones.
    bcs["cell_id"] = [f"{channel}_{raw_id.split('-')[0]}" for raw_id in bcs["raw_cell_id"]]
    valid_cells = [cell_id for cell_id in bcs["cell_id"] if cell_id in all_cells]
    # print(f"Matched {len(valid_cells):,} out of {len(bcs):,} cells")

    bcs = bcs.set_index("cell_id")
    bcs.index.name = None

    genes = genes.set_index(0)
    genes.index.name = None

    data_partial.var = genes
    data_partial.obs = bcs
    data_list.append(data_partial[valid_cells].copy())

# Concatenate all samples, order the cells like the annotation table, and attach it as obs.
merged = ad.concat(data_list)
data = merged[annot_df.index].copy()
data.obs = annot_df

data

Extracting Marrow-10X_P7_3
Extracting Kidney-10X_P7_5
Extracting Kidney-10X_P4_6
Extracting Lung-10X_P7_9
Extracting Spleen-10X_P4_7
Extracting Lung-10X_P8_13
Extracting Kidney-10X_P4_5
Extracting Mammary_Gland-10X_P7_13
Extracting Tongue-10X_P4_1
Extracting Trachea-10X_P8_15
Extracting Liver-10X_P7_0
Extracting Limb_Muscle-10X_P7_15
Extracting Heart_and_Aorta-10X_P7_4
Extracting Bladder-10X_P4_3
Extracting Lung-10X_P7_8
Extracting Bladder-10X_P4_4
Extracting Bladder-10X_P7_7
Extracting Marrow-10X_P7_2
Extracting Tongue-10X_P7_10
Extracting Limb_Muscle-10X_P7_14
Extracting Thymus-10X_P7_11
Extracting Spleen-10X_P7_6
Extracting Tongue-10X_P4_0
Extracting Mammary_Gland-10X_P7_12
Extracting Trachea-10X_P8_14
Extracting Liver-10X_P4_2
Extracting Liver-10X_P7_1
Extracting Lung-10X_P8_12


AnnData object with n_obs × n_vars = 55656 × 23433
    obs: 'cell_ontology_class', 'cell_ontology_id', 'channel', 'cluster.ids', 'mouse.id', 'mouse.sex', 'tissue', 'tissue_tSNE_1', 'tissue_tSNE_2'

In [26]:
# Filtering: drop genes detected in fewer than `min_cells_per_gene` cells (modifies `data` in place)
min_cells_per_gene = 100
sc.pp.filter_genes(data, min_cells=min_cells_per_gene, inplace=True)
data

AnnData object with n_obs × n_vars = 55656 × 15815
    obs: 'cell_ontology_class', 'cell_ontology_id', 'channel', 'cluster.ids', 'mouse.id', 'mouse.sex', 'tissue', 'tissue_tSNE_1', 'tissue_tSNE_2'
    var: 'n_cells'

In [27]:
# Normalize: scale each cell to `counts_per_cell` total counts, then apply log1p (both in place)
counts_per_cell = 10_000
sc.pp.normalize_total(data, target_sum=counts_per_cell)
sc.pp.log1p(data)
data

AnnData object with n_obs × n_vars = 55656 × 15815
    obs: 'cell_ontology_class', 'cell_ontology_id', 'channel', 'cluster.ids', 'mouse.id', 'mouse.sex', 'tissue', 'tissue_tSNE_1', 'tissue_tSNE_2'
    var: 'n_cells'
    uns: 'log1p'

In [28]:
# Gene id mapping: the var index (gene symbols) is mapped and then replaced.
# The helper is imported by name in the header block, so call it directly; the
# module name `data_processing_utils` is never bound and raises NameError on a fresh kernel.
symbol_to_ensembl_mapping = symbol_to_ensembl_from_ensembl(
    data_dir=PROJECT_DIR / "data", genes=data.var.index.tolist(), species="mouse")
# Keep the full mapping alongside the data for provenance.
data.uns["gene_mapping:symbol_to_ensembl"] = symbol_to_ensembl_mapping.mapping_full

# Preserve the original symbols as a column before the index is overwritten.
data.var["gene_symbol"] = data.var.index
data.var["gene_ensembl"] = data.var["gene_symbol"].map(symbol_to_ensembl_mapping.mapping_combined.get)
# NOTE(review): the helper logs that not every symbol is mapped; confirm what
# mapping_reduced yields for unmapped symbols (NaN/duplicate index entries?) — TODO verify.
data.var.index = data.var.index.map(symbol_to_ensembl_mapping.mapping_reduced)

data.var

Mapping data directory: /work/magroup/shared/Heimdall/data/gene_mapping/ensembl/mouse
Loading mapping from cache: /work/magroup/shared/Heimdall/data/gene_mapping/ensembl/mouse/symbol_to_ensembl.json
Successfully mapped 13,867 out of 15,815 genes (87.7%)


Unnamed: 0,n_cells,gene_symbol,gene_ensembl
ENSMUSG00000025902,2049,Sox17,ENSMUSG00000025902
ENSMUSG00000033845,16312,Mrpl15,ENSMUSG00000033845
ENSMUSG00000025903,15290,Lypla1,ENSMUSG00000025903
ENSMUSG00000033813,21414,Tcea1,ENSMUSG00000033813
ENSMUSG00000002459,392,Rgs20,ENSMUSG00000002459
...,...,...,...
ENSMUSG00000056673,1086,Kdm5d,ENSMUSG00000056673
ENSMUSG00000069049,7841,Eif2s3y,ENSMUSG00000069049
ENSMUSG00000068457,851,Uty,ENSMUSG00000068457
ENSMUSG00000069045,10450,Ddx3y,ENSMUSG00000069045


In [29]:
# Standardize attributes
data.obs["species"] = "Mus musculus"
obs_column_map = {"cell_ontology_class": "celltype"}
data.obs.rename(columns=obs_column_map, inplace=True)

# Integer-encode cell types; code k corresponds to data.uns["celltype_order"][k].
celltype_encoder = LabelEncoder()
data.obs["task_celltype"] = celltype_encoder.fit_transform(data.obs["celltype"])
data.uns["celltype_order"] = celltype_encoder.classes_.tolist()

# The "channel" column serves as the batch label, encoded the same way.
batch_encoder = LabelEncoder()
data.obs["batch"] = batch_encoder.fit_transform(data.obs["channel"])
data.uns["batch_order"] = batch_encoder.classes_.tolist()

data

AnnData object with n_obs × n_vars = 55656 × 15815
    obs: 'celltype', 'cell_ontology_id', 'channel', 'cluster.ids', 'mouse.id', 'mouse.sex', 'tissue', 'tissue_tSNE_1', 'tissue_tSNE_2', 'species', 'task_celltype', 'batch'
    var: 'n_cells', 'gene_symbol', 'gene_ensembl'
    uns: 'log1p', 'gene_mapping:symbol_to_ensembl', 'celltype_order', 'batch_order'

In [30]:
# Save processed data
tm_out_path = PROCESSED_DATA_DIR / "tm.h5ad"
data.write_h5ad(tm_out_path)

## Allen Mouse Brain (AMB)

https://cellxgene.cziscience.com/collections/45f0f67d-4b69-4a3c-a4e8-a63b962e843f

In [31]:
# Download the dataset (.h5ad) from the CELLxGENE datasets host into the raw data directory.
!wget https://datasets.cellxgene.cziscience.com/1de08aa2-b3c6-4f2d-9429-5d0a6b716b5c.h5ad -O {RAW_DATA_DIR}/amb.h5ad

--2024-07-10 11:10:56--  https://datasets.cellxgene.cziscience.com/1de08aa2-b3c6-4f2d-9429-5d0a6b716b5c.h5ad
Resolving datasets.cellxgene.cziscience.com (datasets.cellxgene.cziscience.com)... 18.160.200.37, 18.160.200.32, 18.160.200.93, ...
Connecting to datasets.cellxgene.cziscience.com (datasets.cellxgene.cziscience.com)|18.160.200.37|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1298779249 (1.2G) [binary/octet-stream]
Saving to: ‘/work/magroup/shared/Heimdall/data/cell_type_annotation/raw/amb.h5ad’


2024-07-10 11:11:08 (106 MB/s) - ‘/work/magroup/shared/Heimdall/data/cell_type_annotation/raw/amb.h5ad’ saved [1298779249/1298779249]



In [32]:
# Load raw data
amb_raw_path = RAW_DATA_DIR / "amb.h5ad"
data = ad.read_h5ad(amb_raw_path)
data

AnnData object with n_obs × n_vars = 22375 × 35304
    obs: 'donor_id', 'age_days', 'eye_condition', 'genotype', 'driver_lines', 'reporter_lines', 'brain_hemisphere', 'brain_region', 'brain_subregion', 'injection_label_direction', 'injection_primary', 'injection_secondary', 'injection_tract', 'injection_material', 'injection_exclusion_criterion', 'facs_date', 'facs_container', 'facs_sort_criteria', 'rna_amplification_set', 'rna_amplification_pcr_cycles', 'library_prep_set', 'library_prep_avg_size_bp', 'seq_tube', 'seq_batch', 'total_reads', 'percent_exon_reads', 'percent_intron_reads', 'percent_intergenic_reads', 'percent_rrna_reads', 'percent_mt_exon_reads', 'percent_reads_unique', 'percent_synth_reads', 'percent_ecoli_reads', 'percent_aligned_reads_total', 'complexity_cg', 'genes_detected_cpm_criterion', 'genes_detected_fpkm_criterion', 'tdt_cpm', 'gfp_cpm', 'BICCN_class_label', 'BICCN_subclass_label', 'cluster', 'confusion_score', 'cluster_correlation', 'core_intermediate_call', 'BI

In [33]:
# Filtering: drop genes detected in fewer than `min_cells_per_gene` cells (modifies `data` in place)
min_cells_per_gene = 100
sc.pp.filter_genes(data, min_cells=min_cells_per_gene, inplace=True)
data

AnnData object with n_obs × n_vars = 22375 × 23326
    obs: 'donor_id', 'age_days', 'eye_condition', 'genotype', 'driver_lines', 'reporter_lines', 'brain_hemisphere', 'brain_region', 'brain_subregion', 'injection_label_direction', 'injection_primary', 'injection_secondary', 'injection_tract', 'injection_material', 'injection_exclusion_criterion', 'facs_date', 'facs_container', 'facs_sort_criteria', 'rna_amplification_set', 'rna_amplification_pcr_cycles', 'library_prep_set', 'library_prep_avg_size_bp', 'seq_tube', 'seq_batch', 'total_reads', 'percent_exon_reads', 'percent_intron_reads', 'percent_intergenic_reads', 'percent_rrna_reads', 'percent_mt_exon_reads', 'percent_reads_unique', 'percent_synth_reads', 'percent_ecoli_reads', 'percent_aligned_reads_total', 'complexity_cg', 'genes_detected_cpm_criterion', 'genes_detected_fpkm_criterion', 'tdt_cpm', 'gfp_cpm', 'BICCN_class_label', 'BICCN_subclass_label', 'cluster', 'confusion_score', 'cluster_correlation', 'core_intermediate_call', 'BI

In [34]:
# Standardize attributes
# Map the dataset's obs column names onto the notebook's standard ones.
obs_column_map = {"BICCN_subclass_label": "celltype", "organism": "species"}
data.obs.rename(columns=obs_column_map, inplace=True)

# Gene symbols come from the "feature_name" column; the var index is copied into gene_ensembl.
data.var["gene_symbol"] = data.var["feature_name"]
data.var["gene_ensembl"] = data.var.index

# Integer-encode cell types; code k corresponds to data.uns["celltype_order"][k].
celltype_encoder = LabelEncoder()
data.obs["task_celltype"] = celltype_encoder.fit_transform(data.obs["celltype"])
data.uns["celltype_order"] = celltype_encoder.classes_.tolist()

data

AnnData object with n_obs × n_vars = 22375 × 23326
    obs: 'donor_id', 'age_days', 'eye_condition', 'genotype', 'driver_lines', 'reporter_lines', 'brain_hemisphere', 'brain_region', 'brain_subregion', 'injection_label_direction', 'injection_primary', 'injection_secondary', 'injection_tract', 'injection_material', 'injection_exclusion_criterion', 'facs_date', 'facs_container', 'facs_sort_criteria', 'rna_amplification_set', 'rna_amplification_pcr_cycles', 'library_prep_set', 'library_prep_avg_size_bp', 'seq_tube', 'seq_batch', 'total_reads', 'percent_exon_reads', 'percent_intron_reads', 'percent_intergenic_reads', 'percent_rrna_reads', 'percent_mt_exon_reads', 'percent_reads_unique', 'percent_synth_reads', 'percent_ecoli_reads', 'percent_aligned_reads_total', 'complexity_cg', 'genes_detected_cpm_criterion', 'genes_detected_fpkm_criterion', 'tdt_cpm', 'gfp_cpm', 'BICCN_class_label', 'celltype', 'cluster', 'confusion_score', 'cluster_correlation', 'core_intermediate_call', 'BICCN_ontology

In [35]:
# Save processed data
amb_out_path = PROCESSED_DATA_DIR / "amb.h5ad"
data.write_h5ad(amb_out_path)

## Mouse Pancreatic Islet (MPI)

https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE211799

In [36]:
!wget https://ftp.ncbi.nlm.nih.gov/geo/series/GSE211nnn/GSE211799/suppl/GSE211799%5Fadata%5Fatlas.h5ad.gz -O {RAW_DATA_DIR}/mpi.h5ad.gz
!gunzip {RAW_DATA_DIR}/mpi.h5ad.gz -d {RAW_DATA_DIR}

--2024-07-10 11:12:25--  https://ftp.ncbi.nlm.nih.gov/geo/series/GSE211nnn/GSE211799/suppl/GSE211799%5Fadata%5Fatlas.h5ad.gz
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.13, 130.14.250.12, 2607:f220:41e:250::12, ...
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.13|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4257833437 (4.0G) [application/x-gzip]
Saving to: ‘/work/magroup/shared/Heimdall/data/cell_type_annotation/raw/mpi.h5ad.gz’


2024-07-10 11:13:18 (76.7 MB/s) - ‘/work/magroup/shared/Heimdall/data/cell_type_annotation/raw/mpi.h5ad.gz’ saved [4257833437/4257833437]

gzip: /work/magroup/shared/Heimdall/data/cell_type_annotation/raw is a directory -- ignored


In [37]:
# Load raw data
mpi_raw_path = RAW_DATA_DIR / "mpi.h5ad"
data = ad.read_h5ad(mpi_raw_path)
data

AnnData object with n_obs × n_vars = 301796 × 31706
    obs: 'study_sample', 'study', 'file', 'reference', 'size_factors_sample', 'phase_cyclone', 's_cyclone', 'g2m_cyclone', 'g1_cyclone', 'sex', 'ins_score', 'ins_high', 'gcg_score', 'gcg_high', 'sst_score', 'sst_high', 'ppy_score', 'ppy_high', 'cell_filtering', 'age', 'strain', 'tissue', 'technique', 'study_sample_design', 'cell_type', 'cell_type_multiplet', 'cell_subtype', 'cell_subtype_multiplet', 'design', 'size_factors_integrated', 'pre_cell_type_unified', 'pre_cell_type_original', 'study_parsed', 'cell_type_parsed', 'low_q', 'BETA-DATA_leiden_r1.5', 'BETA-DATA_leiden_r20', 'BETA-DATA_hc_gene_programs', 'BETA-DATA_hc_gene_programs_parsed', 'BETA-DATA_leiden_r1.5_parsed', 'BETA-DATA_leiden_r1.5_parsed_const', 'CXG-DATA_n_genes', 'CXG-DATA_mt_frac', 'CXG-DATA_doublet_score', 'CXG-DATA_log10_n_counts', 'CXG-DATA_age_approxDays', 'CXG-DATA_cell_subtype_immune_reannotatedIntegrated', 'CXG-DATA_cell_subtype_endothelial_reannotatedIntegr

In [38]:
# Filtering: drop genes detected in fewer than `min_cells_per_gene` cells (modifies `data` in place)
min_cells_per_gene = 100
sc.pp.filter_genes(data, min_cells=min_cells_per_gene, inplace=True)
data

AnnData object with n_obs × n_vars = 301796 × 19888
    obs: 'study_sample', 'study', 'file', 'reference', 'size_factors_sample', 'phase_cyclone', 's_cyclone', 'g2m_cyclone', 'g1_cyclone', 'sex', 'ins_score', 'ins_high', 'gcg_score', 'gcg_high', 'sst_score', 'sst_high', 'ppy_score', 'ppy_high', 'cell_filtering', 'age', 'strain', 'tissue', 'technique', 'study_sample_design', 'cell_type', 'cell_type_multiplet', 'cell_subtype', 'cell_subtype_multiplet', 'design', 'size_factors_integrated', 'pre_cell_type_unified', 'pre_cell_type_original', 'study_parsed', 'cell_type_parsed', 'low_q', 'BETA-DATA_leiden_r1.5', 'BETA-DATA_leiden_r20', 'BETA-DATA_hc_gene_programs', 'BETA-DATA_hc_gene_programs_parsed', 'BETA-DATA_leiden_r1.5_parsed', 'BETA-DATA_leiden_r1.5_parsed_const', 'CXG-DATA_n_genes', 'CXG-DATA_mt_frac', 'CXG-DATA_doublet_score', 'CXG-DATA_log10_n_counts', 'CXG-DATA_age_approxDays', 'CXG-DATA_cell_subtype_immune_reannotatedIntegrated', 'CXG-DATA_cell_subtype_endothelial_reannotatedIntegr

In [42]:
# Standardize attributes
data.obs["species"] = "Mus musculus"
obs_column_map = {"cell_type_integrated_v2": "celltype"}
data.obs.rename(columns=obs_column_map, inplace=True)

# The var index is copied into gene_ensembl after clearing the index name.
# NOTE(review): unlike the other sections, no "gene_symbol" column is set here —
# confirm whether the source file already provides one.
data.var.index.name = None
data.var["gene_ensembl"] = data.var.index

# Integer-encode cell types; code k corresponds to data.uns["celltype_order"][k].
celltype_encoder = LabelEncoder()
data.obs["task_celltype"] = celltype_encoder.fit_transform(data.obs["celltype"])
data.uns["celltype_order"] = celltype_encoder.classes_.tolist()

# The "study_sample_design" column serves as the batch label, encoded the same way.
batch_encoder = LabelEncoder()
data.obs["batch"] = batch_encoder.fit_transform(data.obs["study_sample_design"])
data.uns["batch_order"] = batch_encoder.classes_.tolist()

data

AnnData object with n_obs × n_vars = 301796 × 19888
    obs: 'study_sample', 'study', 'file', 'reference', 'size_factors_sample', 'phase_cyclone', 's_cyclone', 'g2m_cyclone', 'g1_cyclone', 'sex', 'ins_score', 'ins_high', 'gcg_score', 'gcg_high', 'sst_score', 'sst_high', 'ppy_score', 'ppy_high', 'cell_filtering', 'age', 'strain', 'tissue', 'technique', 'study_sample_design', 'cell_type', 'cell_type_multiplet', 'cell_subtype', 'cell_subtype_multiplet', 'design', 'size_factors_integrated', 'pre_cell_type_unified', 'pre_cell_type_original', 'study_parsed', 'cell_type_parsed', 'low_q', 'BETA-DATA_leiden_r1.5', 'BETA-DATA_leiden_r20', 'BETA-DATA_hc_gene_programs', 'BETA-DATA_hc_gene_programs_parsed', 'BETA-DATA_leiden_r1.5_parsed', 'BETA-DATA_leiden_r1.5_parsed_const', 'CXG-DATA_n_genes', 'CXG-DATA_mt_frac', 'CXG-DATA_doublet_score', 'CXG-DATA_log10_n_counts', 'CXG-DATA_age_approxDays', 'CXG-DATA_cell_subtype_immune_reannotatedIntegrated', 'CXG-DATA_cell_subtype_endothelial_reannotatedIntegr

In [44]:
# Save processed data
mpi_out_path = PROCESSED_DATA_DIR / "mpi.h5ad"
data.write_h5ad(mpi_out_path)