In [10]:
import os, pathlib, sys
from urllib import request

import scanpy as sc
#import scvi
import numpy as np
import pandas as pd

## Download data

We can download the data from the [developmental cell atlas portal](https://developmental.cellatlas.io/fetal-immune).
Here I downloaded the `HSC/progenitor cells` dataset because it's quite small.

In [12]:
anndata_path = "https://cellgeni.cog.sanger.ac.uk/developmentcellatlas/fetal-immune/PAN.A01.v01.raw_count.20210429.HSC_PROGENITORS.embedding.h5ad"
download_path = "../../data/processed/hsc_progenitors.h5ad"
request.urlretrieve(anndata_path, download_path)

('../../data/processed/hsc_progenitors.h5ad',
 <http.client.HTTPMessage at 0x1cefd6a9c70>)

## EDA

Here I am exploring the contents of the downloaded `.h5ad` file.

In [13]:
ad = sc.read_h5ad("../../data/processed/hsc_progenitors.h5ad")
ad

AnnData object with n_obs × n_vars = 40422 × 33538
    obs: 'n_counts', 'n_genes', 'file', 'mito', 'doublet_scores', 'predicted_doublets', 'old_annotation_uniform', 'organ', 'Sort_id', 'age', 'method', 'donor', 'sex', 'Sample', 'scvi_clusters', 'is_maternal_contaminant', 'anno_lvl_2_final_clean', 'celltype_annotation'
    var: 'GeneID', 'GeneName', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'scvi_model_var'
    uns: 'leiden', 'scvi', 'umap'
    obsm: 'X_scvi', 'X_umap'
    obsp: 'scvi_connectivities', 'scvi_distances'

In [17]:
ad.obs.head().T

index,FCAImmP7579224-TCTTTCCCAAGCCGTC,FCAImmP7579224-GGAATAATCAGGTAAA,FCAImmP7579224-TAGCCGGGTACCATCA,FCAImmP7579224-GCATGCGCAGCTGCAC,FCAImmP7579224-GATGCTATCTCTGCTG
n_counts,28616.0,20510.0,18168.0,14823.0,12947.0
n_genes,5176,4007,3903,3796,3095
file,FCAImmP7579224,FCAImmP7579224,FCAImmP7579224,FCAImmP7579224,FCAImmP7579224
mito,0.035994,0.044174,0.062087,0.046414,0.019927
doublet_scores,0.152941,0.13369,0.110588,0.164087,0.294118
predicted_doublets,False,False,False,False,False
old_annotation_uniform,B CELL,B CELL,B CELL,HSC,B CELL
organ,SK,SK,SK,SK,SK
Sort_id,CD45P,CD45P,CD45P,CD45P,CD45P
age,12,12,12,12,12


In [26]:
print(f"Number of donors: {ad.obs.donor.nunique()}")
print(f"Number of annotated cell types: {ad.obs.celltype_annotation.nunique()}")

Number of donors: 25
Number of annotated cell types: 16


In [32]:
ad.obs.groupby("sex").donor.nunique()

sex
female    17
male       6
nan        2
Name: donor, dtype: int64

In [27]:
ad.obs.celltype_annotation.value_counts()

DN(P)_T          9810
EARLY_MK         4740
GMP              3862
HSC_MPP          3403
PRE_PRO_B        3019
CYCLING_MPP      2374
PROMONOCYTE      2324
CMP              1972
MEMP             1936
DN(early)_T      1670
MEP              1408
LMPP_MLP         1395
CYCLING_MEMP      922
PROMYELOCYTE      834
DOUBLET_ERY_B     751
nan                 2
Name: celltype_annotation, dtype: int64