# Single Cell Atlas
We look at some data from https://www.covid19cellatlas.org/
with the goal of comparing healthy donors with patients.

In [1]:
!ls data/atlas

20200917_MGH_Broad_Villani_to_CZI.h5ad
Single_cell_atlas_of_peripheral_immune_response_to_SARS_CoV_2_infection.h5ad
ucl-sanger_covid_airway.submit_to_czi.cellxgene.20200911.h5ad
vieira19_Alveoli_and_parenchyma_anonymised.processed.h5ad


In [2]:
from anndata import read_h5ad
import numpy as np
import pandas as pd

In [3]:
def gene_expression_count(t, gene_name):
    """Count the non-zero entries of ``t.X`` in the column of one gene.

    Parameters
    ----------
    t : object exposing ``var`` and ``X`` (e.g. an AnnData object)
        ``t.var`` is a DataFrame indexed by gene name and ``t.X`` is a 2-D
        matrix (dense or scipy sparse). The position of the gene in
        ``t.var.index`` is used as the column index into ``t.X``.
    gene_name : hashable
        Label to look up in ``t.var.index``.

    Returns
    -------
    int
        Number of non-zero entries found in the selected column(s).

    Raises
    ------
    KeyError
        If ``gene_name`` is not present in ``t.var.index``.
    """
    # var.index is the very same Index as var.T.columns, but reading it does
    # not build a transposed copy of the whole annotation table on every call.
    try:
        column = t.var.index.get_loc(gene_name)
    except KeyError:
        raise KeyError(f"gene {gene_name!r} not found in var index") from None
    # nonzero() returns index arrays; the first one holds the row positions.
    nonzero_rows = t.X[:, column].nonzero()[0]
    return len(nonzero_rows)

In [4]:
# Load the four .h5ad files listed above with anndata's read_h5ad.
# Short names are reused throughout the notebook (descriptions taken from the file names):
#   t  - vieira19 alveoli and parenchyma dataset
#   u  - ucl-sanger covid airway dataset
#   u2 - MGH / Broad (Villani) dataset
#   u3 - peripheral immune response to SARS-CoV-2 infection dataset
t = read_h5ad('data/atlas/vieira19_Alveoli_and_parenchyma_anonymised.processed.h5ad')
u = read_h5ad('data/atlas/ucl-sanger_covid_airway.submit_to_czi.cellxgene.20200911.h5ad')
u2 = read_h5ad('data/atlas/20200917_MGH_Broad_Villani_to_CZI.h5ad')
u3 = read_h5ad('data/atlas/Single_cell_atlas_of_peripheral_immune_response_to_SARS_CoV_2_infection.h5ad')

In [5]:
# Number of non-zero TMPRSS2 entries in t (vieira19 alveoli and parenchyma file).
gene_expression_count(t, 'TMPRSS2')

1794

In [6]:
# Number of non-zero TMPRSS2 entries in u (ucl-sanger covid airway file).
gene_expression_count(u, 'TMPRSS2')

2360

In [7]:
# Number of non-zero TMPRSS2 entries in u3 (peripheral immune response file).
# u2 is not queried: the scratchpad below finds no var name containing 'TMPRSS2' in u2.
gene_expression_count(u3, 'TMPRSS2')

83

# Scratchpad

In [8]:
t

AnnData object with n_obs × n_vars = 12971 × 33694
    obs: 'Sample', 'Donor', 'Source', 'Location', 'CellType', 'BroadCellType'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'leiden', 'neighbors_hm', 'pca'
    obsm: 'X_umap_hm'
    varm: 'PCs'

In [9]:
t.X

<12971x33694 sparse matrix of type '<class 'numpy.float32'>'
	with 29602881 stored elements in Compressed Sparse Column format>

In [10]:
t.obs

Unnamed: 0_level_0,Sample,Donor,Source,Location,CellType,BroadCellType
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
LungTranscriptome7239220_LungTranscriptome7239220ACGGAGATCAAGGTAA-0,LungTranscriptome7239220_LungTranscriptome7239220,298C,Transplant,Alveoli and parenchyma,Ciliated 1,Ciliated
LungTranscriptome7135919_GGCGTGTCATACCATG-0,LungTranscriptome7135919,290B,Transplant,Alveoli and parenchyma,Ciliated 1,Ciliated
LungTranscriptome7135920_GCGACCATCAGTTAGC-0,LungTranscriptome7135920,290B,Transplant,Alveoli and parenchyma,Basal 2,Basal
LungTranscriptome7239219_LungTranscriptome7239219CATTCGCAGCCGCCTA-0,LungTranscriptome7239219_LungTranscriptome7239219,298C,Transplant,Alveoli and parenchyma,Ciliated 1,Ciliated
LungTranscriptome7135920_TCAACGAGTATCTGCA-0,LungTranscriptome7135920,290B,Transplant,Alveoli and parenchyma,Ciliated 1,Ciliated
...,...,...,...,...,...,...
LungTranscriptome7239219_LungTranscriptome7239219GGGCATCCACCAACCG-1,LungTranscriptome7239219_LungTranscriptome7239219,298C,Transplant,Alveoli and parenchyma,Mast cells,Mast cells
LungTranscriptome7239219_LungTranscriptome7239219TGCTGCTAGCAATCTC-1,LungTranscriptome7239219_LungTranscriptome7239219,298C,Transplant,Alveoli and parenchyma,Mast cells,Mast cells
LungTranscriptome7239213_GACTGCGTCCTGCCAT-1,LungTranscriptome7239213,292B,Transplant,Alveoli and parenchyma,Mast cells,Mast cells
LungTranscriptome7239218_GCTGCGACATTCACTT-1,LungTranscriptome7239218,302C,Transplant,Alveoli and parenchyma,Mast cells,Mast cells


In [11]:
t.var

Unnamed: 0_level_0,highly_variable,means,dispersions,dispersions_norm
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
RP11-34P13.3,False,1.000000e-12,,
FAM138A,False,1.000000e-12,,
OR4F5,False,1.000000e-12,,
RP11-34P13.7,False,4.336628e-03,1.440831,0.521987
RP11-34P13.8,False,1.000000e-12,,
...,...,...,...,...
AC233755.2,False,1.913299e-04,0.909056,-1.162769
AC233755.1,False,1.261966e-03,1.266680,-0.029754
AC240274.1,False,7.457980e-02,1.375593,0.315302
AC213203.1,False,1.000000e-12,,


In [12]:
t.var.T

index,RP11-34P13.3,FAM138A,OR4F5,RP11-34P13.7,RP11-34P13.8,RP11-34P13.14,RP11-34P13.9,FO538757.3,FO538757.2,AP006222.2,...,AC007325.2,BX072566.1,AL354822.1,AC023491.2,AC004556.1,AC233755.2,AC233755.1,AC240274.1,AC213203.1,FAM231B
highly_variable,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
means,1e-12,1e-12,1e-12,0.00433663,1e-12,1e-12,1e-12,0.000505906,0.473574,0.343604,...,0.03061,1e-12,0.0107444,1e-12,0.0394213,0.00019133,0.00126197,0.0745798,1e-12,1e-12
dispersions,,,,1.44083,,,,1.35848,1.29746,1.14623,...,1.09921,,1.23467,,1.46588,0.909056,1.26668,1.37559,,
dispersions_norm,,,,0.521987,,,,0.261073,0.353819,-0.707995,...,-0.560324,,-0.13116,,0.601342,-1.16277,-0.029754,0.315302,,


In [13]:
# Per-gene annotations for TMPRSS2. Selecting the row by label gives the same
# Series as t.var.T['TMPRSS2'] without transposing the whole var table first.
t.var.loc['TMPRSS2']

highly_variable       False
means               0.35141
dispersions         1.17911
dispersions_norm   -0.50413
Name: TMPRSS2, dtype: object

In [14]:
# Position of TMPRSS2 among the rows of t.var; used below as a column index into t.X.
# t.var.index is the same Index as t.var.T.columns, minus the transposed copy.
t.var.index.get_loc('TMPRSS2')

33499

In [15]:
t.X

<12971x33694 sparse matrix of type '<class 'numpy.float32'>'
	with 29602881 stored elements in Compressed Sparse Column format>

In [16]:
# Column of t.X for TMPRSS2, looked up by name instead of hard-coding the
# position (33499) copied from an earlier output.
gene_expression = t.X[:, t.var.index.get_loc('TMPRSS2')]

In [17]:
gene_expression

<12971x1 sparse matrix of type '<class 'numpy.float32'>'
	with 1794 stored elements in Compressed Sparse Column format>

In [18]:
gene_expression.nonzero()

(array([    3,    17,    18, ..., 12913, 12927, 12945], dtype=int32),
 array([0, 0, 0, ..., 0, 0, 0], dtype=int32))

In [19]:
# a: row indices of the non-zero entries; b: their column indices
# (all 0 here, because gene_expression is a single-column slice).
a, b = gene_expression.nonzero()

In [20]:
# NOTE(review): gene_expression[3] is still a sparse matrix, so the DataFrame shows a
# single cell holding its text representation (see output) rather than a numeric value;
# presumably gene_expression[3].toarray() was intended - confirm.
pd.DataFrame(gene_expression[3])

Unnamed: 0,0
0,"(0, 0)\t1.534591"


In [21]:
# Multiplying the sparse slice by a scalar doubles the stored value (1.534591 -> 3.069182);
# the result is again displayed as the sparse matrix's text representation.
pd.DataFrame(gene_expression[3]*2)

Unnamed: 0,0
0,"(0, 0)\t3.069182"


In [22]:
len(a)

1794

In [23]:
u.obs

Unnamed: 0_level_0,SampleID,Age,Sex,Race,Ethnicity,BMI,HeartDisease,LungDisease,KidneyDisease,Diabetes,...,TypeOfSample,n_counts,n_genes,percent_hb,percent_mito,percent_ribo,percent_top50,Annotation,Institute,ObjectCreateDate
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACCTGAGATGCGAC-CV001_KM8853698,AP1-NB,50-60,M,Black,Not Hispanic or Latino,30.0-39.9 (obese),No,No,No,No,...,Nasal brush,21189.0,4816,0.014158,1.873614,8.688471,31.322856,Secretory 1,UCL-SANGER,20200804
AAACCTGCAACACGCC-CV001_KM8853698,AP1-NB,50-60,M,Black,Not Hispanic or Latino,30.0-39.9 (obese),No,No,No,No,...,Nasal brush,9462.0,3532,0.010569,1.754386,11.086452,23.451702,Secretory 1,UCL-SANGER,20200804
AAACCTGCACACGCTG-CV001_KM8853698,AP1-NB,50-60,M,Black,Not Hispanic or Latino,30.0-39.9 (obese),No,No,No,No,...,Nasal brush,2595.0,1345,0.038536,3.121387,9.055877,29.980732,Secretory 1,UCL-SANGER,20200804
AAACCTGTCCTACAGA-CV001_KM8853698,AP1-NB,50-60,M,Black,Not Hispanic or Latino,30.0-39.9 (obese),No,No,No,No,...,Nasal brush,5105.0,2192,0.039177,10.342801,0.705191,38.707150,Secretory 2,UCL-SANGER,20200804
AAACGGGAGTGAAGTT-CV001_KM8853698,AP1-NB,50-60,M,Black,Not Hispanic or Latino,30.0-39.9 (obese),No,No,No,No,...,Nasal brush,1789.0,1055,0.055897,0.279486,5.869201,29.122415,Secretory 1,UCL-SANGER,20200804
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTCACAGACAAGC-CV001_KM8854513,AP5-NB,50-60,F,Asian,Not Hispanic or Latino,Unknown,No,No,No,No,...,Nasal brush,3295.0,1868,0.000000,2.579666,7.162367,21.274659,Ciliated 1,UCL-SANGER,20200804
TTTGTCATCAGATAAG-CV001_KM8854513,AP5-NB,50-60,F,Asian,Not Hispanic or Latino,Unknown,No,No,No,No,...,Nasal brush,8014.0,2787,0.024956,1.734465,7.686549,31.707013,Secretory 3,UCL-SANGER,20200804
TTTGTCATCAGCTCTC-CV001_KM8854513,AP5-NB,50-60,F,Asian,Not Hispanic or Latino,Unknown,No,No,No,No,...,Nasal brush,6386.0,2812,0.015659,1.424992,6.921391,24.851237,Ciliated 1,UCL-SANGER,20200804
TTTGTCATCCCATTTA-CV001_KM8854513,AP5-NB,50-60,F,Asian,Not Hispanic or Latino,Unknown,No,No,No,No,...,Nasal brush,2216.0,1219,0.000000,1.850181,8.167871,28.971119,Secretory 1,UCL-SANGER,20200804


In [24]:
u2.var.columns

Index(['gene_ids'], dtype='object')

In [25]:
u2.var

Unnamed: 0_level_0,gene_ids
var_names,Unnamed: 1_level_1
MIR1302-2HG,ENSG00000243485
AL627309.1,ENSG00000238009
AL627309.3,ENSG00000239945
AL669831.2,ENSG00000229905
AL669831.5,ENSG00000237491
...,...
AL354822.1,ENSG00000278384
AC004556.1,ENSG00000276345
AC233755.2,ENSG00000277856
AC233755.1,ENSG00000275063


In [26]:
# NOTE: only the last expression of a cell is displayed, so the position returned
# by get_loc on the next line is computed and then discarded.
u2.var.T.columns.get_loc('AL627309.1')
# Select var rows by their gene_ids value instead of by gene name.
u2.var[u2.var['gene_ids'] == 'ENSG00000238009']

Unnamed: 0_level_0,gene_ids
var_names,Unnamed: 1_level_1
AL627309.1,ENSG00000238009


In [27]:
# Gene names in u2 that contain 'TMPRSS2'. u2.var.index holds the same labels as
# u2.var.T.columns, so iterate it directly instead of transposing the table.
cols = [name for name in u2.var.index if 'TMPRSS2' in name]

In [28]:
cols

[]

In [29]:
# Identifier to search for in u2's gene_ids column, since no var name matched 'TMPRSS2'.
# Presumably the Ensembl gene ID of TMPRSS2 (per the variable name) - confirm.
tmprss2 = 'ENSG00000184012'

In [30]:
# Rows of u2.var whose gene_ids equals the identifier above; the result is empty,
# i.e. u2 does not list this gene under that ID either.
u2.var[u2.var['gene_ids'] == tmprss2]

Unnamed: 0_level_0,gene_ids
var_names,Unnamed: 1_level_1


In [36]:
# Gene names of t. var.index is the same Index as var.T.columns, without
# building a transposed copy of the annotation table.
ts = t.var.index
len(ts)

33694

In [37]:
# Gene names of u. var.index is the same Index as var.T.columns, without
# building a transposed copy of the annotation table.
us = u.var.index
len(us)

33421

In [40]:
# Gene names present in both t and u, and how many there are.
tus = ts.intersection(us)
len(tus)

22045