# Single Cell Atlas
We look at some data from https://www.covid19cellatlas.org/
with the goal of comparing healthy and patient donors.

In [1]:
!ls data/atlas

In [2]:
import h5py

In [3]:
f = h5py.File('data/atlas/vieira19_Alveoli_and_parenchyma_anonymised.processed.h5ad', 'r')
list(f.keys())

['X', 'obs', 'obsm', 'uns', 'var', 'varm']

In [4]:
from anndata import read_h5ad

In [5]:
t = read_h5ad('data/atlas/vieira19_Alveoli_and_parenchyma_anonymised.processed.h5ad')

In [6]:
t

AnnData object with n_obs × n_vars = 12971 × 33694
    obs: 'Sample', 'Donor', 'Source', 'Location', 'CellType', 'BroadCellType'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'leiden', 'neighbors_hm', 'pca'
    obsm: 'X_umap_hm'
    varm: 'PCs'

In [7]:
t.X

<12971x33694 sparse matrix of type '<class 'numpy.float32'>'
	with 29602881 stored elements in Compressed Sparse Column format>

In [8]:
t.obs

Unnamed: 0_level_0,Sample,Donor,Source,Location,CellType,BroadCellType
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
LungTranscriptome7239220_LungTranscriptome7239220ACGGAGATCAAGGTAA-0,LungTranscriptome7239220_LungTranscriptome7239220,298C,Transplant,Alveoli and parenchyma,Ciliated 1,Ciliated
LungTranscriptome7135919_GGCGTGTCATACCATG-0,LungTranscriptome7135919,290B,Transplant,Alveoli and parenchyma,Ciliated 1,Ciliated
LungTranscriptome7135920_GCGACCATCAGTTAGC-0,LungTranscriptome7135920,290B,Transplant,Alveoli and parenchyma,Basal 2,Basal
LungTranscriptome7239219_LungTranscriptome7239219CATTCGCAGCCGCCTA-0,LungTranscriptome7239219_LungTranscriptome7239219,298C,Transplant,Alveoli and parenchyma,Ciliated 1,Ciliated
LungTranscriptome7135920_TCAACGAGTATCTGCA-0,LungTranscriptome7135920,290B,Transplant,Alveoli and parenchyma,Ciliated 1,Ciliated
...,...,...,...,...,...,...
LungTranscriptome7239219_LungTranscriptome7239219GGGCATCCACCAACCG-1,LungTranscriptome7239219_LungTranscriptome7239219,298C,Transplant,Alveoli and parenchyma,Mast cells,Mast cells
LungTranscriptome7239219_LungTranscriptome7239219TGCTGCTAGCAATCTC-1,LungTranscriptome7239219_LungTranscriptome7239219,298C,Transplant,Alveoli and parenchyma,Mast cells,Mast cells
LungTranscriptome7239213_GACTGCGTCCTGCCAT-1,LungTranscriptome7239213,292B,Transplant,Alveoli and parenchyma,Mast cells,Mast cells
LungTranscriptome7239218_GCTGCGACATTCACTT-1,LungTranscriptome7239218,302C,Transplant,Alveoli and parenchyma,Mast cells,Mast cells


In [9]:
t.var

Unnamed: 0_level_0,highly_variable,means,dispersions,dispersions_norm
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
RP11-34P13.3,False,1.000000e-12,,
FAM138A,False,1.000000e-12,,
OR4F5,False,1.000000e-12,,
RP11-34P13.7,False,4.336628e-03,1.440831,0.521987
RP11-34P13.8,False,1.000000e-12,,
...,...,...,...,...
AC233755.2,False,1.913299e-04,0.909056,-1.162769
AC233755.1,False,1.261966e-03,1.266680,-0.029754
AC240274.1,False,7.457980e-02,1.375593,0.315302
AC213203.1,False,1.000000e-12,,


In [10]:
t.var.T

index,RP11-34P13.3,FAM138A,OR4F5,RP11-34P13.7,RP11-34P13.8,RP11-34P13.14,RP11-34P13.9,FO538757.3,FO538757.2,AP006222.2,...,AC007325.2,BX072566.1,AL354822.1,AC023491.2,AC004556.1,AC233755.2,AC233755.1,AC240274.1,AC213203.1,FAM231B
highly_variable,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
means,1e-12,1e-12,1e-12,0.00433663,1e-12,1e-12,1e-12,0.000505906,0.473574,0.343604,...,0.03061,1e-12,0.0107444,1e-12,0.0394213,0.00019133,0.00126197,0.0745798,1e-12,1e-12
dispersions,,,,1.44083,,,,1.35848,1.29746,1.14623,...,1.09921,,1.23467,,1.46588,0.909056,1.26668,1.37559,,
dispersions_norm,,,,0.521987,,,,0.261073,0.353819,-0.707995,...,-0.560324,,-0.13116,,0.601342,-1.16277,-0.029754,0.315302,,


In [11]:
t.var.T['TMPRSS2']

highly_variable       False
means               0.35141
dispersions         1.17911
dispersions_norm   -0.50413
Name: TMPRSS2, dtype: object

In [12]:
t.var.T.columns.get_loc('TMPRSS2')

33499

In [13]:
t.X

<12971x33694 sparse matrix of type '<class 'numpy.float32'>'
	with 29602881 stored elements in Compressed Sparse Column format>

In [14]:
gene_expression = t.X[:,33499]

In [15]:
gene_expression

<12971x1 sparse matrix of type '<class 'numpy.float32'>'
	with 1794 stored elements in Compressed Sparse Column format>

In [16]:
gene_expression.nonzero()

(array([    3,    17,    18, ..., 12913, 12927, 12945], dtype=int32),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int32))

In [17]:
a, b = gene_expression.nonzero()

In [18]:
import numpy as np
import pandas as pd

In [19]:
pd.DataFrame(gene_expression[3])

Unnamed: 0,0
0,"(0, 0)\t1.534591"


In [20]:
pd.DataFrame(gene_expression[3]*2)

Unnamed: 0,0
0,"(0, 0)\t3.069182"


In [21]:
len(a)

1794

In [22]:
u = read_h5ad('data/atlas/ucl-sanger_covid_airway.submit_to_czi.cellxgene.20200911.h5ad')

In [23]:
u.X

<10550x33421 sparse matrix of type '<class 'numpy.float32'>'
	with 26021320 stored elements in Compressed Sparse Row format>

In [24]:
u.var.T.columns.get_loc('TMPRSS2')

33241

In [25]:
u_gene_expression = t.X[:,33241]

In [26]:
u_a, u_b = u_gene_expression.nonzero()

In [27]:
len(u_a)

0