### Notebook for the formatting and generation of a joint `anndata` object for the _Cai_2021_ TB data

- **Developed by:** Carlos Talavera-López Ph.D
- **Institute of Computational Biology - Computational Health Centre - Helmholtz Munich**
- v220408

### Rationale:

- This notebook aims at exploring the data generated by Cai _et al_ 2021, which focuses on human cells from PBMC samples in patients with different stages of TB.
- Samples were mapped using the human GENCODE reference with `bustools`.


### Import required modules

In [1]:
import anndata
import numpy as np
import pandas as pd
import scanpy as sc

### Set up working environment

In [2]:
# Set scanpy's logging verbosity and print the versions of the loaded packages
# so that the software environment is recorded in the cell output.
sc.settings.verbosity = 3
sc.logging.print_versions()

# Global figure defaults: display/saved resolution, colour map and file format.
sc.settings.set_figure_params(
    dpi = 180,
    dpi_save = 300,
    color_map = 'magma_r',
    vector_friendly = True,
    format = 'svg',
)

-----
anndata     0.8.0
scanpy      1.9.1
-----
PIL                         9.0.1
asttokens                   NA
backcall                    0.2.0
beta_ufunc                  NA
binom_ufunc                 NA
bottleneck                  1.3.4
cffi                        1.15.0
cloudpickle                 2.0.0
colorama                    0.4.4
cycler                      0.10.0
cython_runtime              NA
cytoolz                     0.11.0
dask                        2022.02.1
dateutil                    2.8.2
debugpy                     1.5.1
decorator                   5.1.1
defusedxml                  0.7.1
executing                   0.8.3
fsspec                      2022.02.0
google                      NA
h5py                        3.6.0
ipykernel                   6.9.1
ipython_genutils            0.2.0
jedi                        0.18.1
jinja2                      2.11.3
joblib                      1.1.0
jupyter_server              1.13.5
kiwisolver                  1.3.2
l

### Samples with active TB

In [3]:
# Run SRR11038989 (sample PBMC_TB_3): read the unfiltered count matrix and
# attach the sample-level metadata as constant `.obs` columns.
adata_1 = sc.read_h5ad('/home/cartalop/data/lung/tb/Cai2020/SRR11038989/counts_unfiltered/adata.h5ad')
adata_1.obs = adata_1.obs.assign(
    study = 'CaiY_2021',
    individual = 'SAMN14048025',
    sample = 'PBMC_TB_3',
    tissue = 'PBMC',
    donor = 'SAMN14048025',
    age = '34',
    gender = 'male',
    status = 'active_TB',
    data_type = 'scRNAseq',
    centre = 'Shenzhen University',
    version = '10XV2',
)
adata_1

AnnData object with n_obs × n_vars = 465705 × 61533
    obs: 'study', 'individual', 'sample', 'tissue', 'donor', 'age', 'gender', 'status', 'data_type', 'centre', 'version'
    var: 'gene_name'

In [4]:
# Run SRR11038990 (sample PBMC_TB_2): read the unfiltered count matrix and
# attach the sample-level metadata as constant `.obs` columns.
adata_2 = sc.read_h5ad('/home/cartalop/data/lung/tb/Cai2020/SRR11038990/counts_unfiltered/adata.h5ad')
adata_2.obs = adata_2.obs.assign(
    study = 'CaiY_2021',
    individual = 'SAMN14048024',
    sample = 'PBMC_TB_2',
    tissue = 'PBMC',
    donor = 'SAMN14048024',
    age = '35',
    gender = 'male',
    status = 'active_TB',
    data_type = 'scRNAseq',
    centre = 'Shenzhen University',
    version = '10XV2',
)
adata_2

AnnData object with n_obs × n_vars = 619020 × 61533
    obs: 'study', 'individual', 'sample', 'tissue', 'donor', 'age', 'gender', 'status', 'data_type', 'centre', 'version'
    var: 'gene_name'

In [5]:
# Run SRR11038991 (sample PBMC_TB_1): read the unfiltered count matrix and
# attach the sample-level metadata as constant `.obs` columns.
adata_3 = sc.read_h5ad('/home/cartalop/data/lung/tb/Cai2020/SRR11038991/counts_unfiltered/adata.h5ad')
adata_3.obs = adata_3.obs.assign(
    study = 'CaiY_2021',
    individual = 'SAMN14048023',
    sample = 'PBMC_TB_1',
    tissue = 'PBMC',
    donor = 'SAMN14048023',
    age = '33',
    gender = 'female',
    status = 'active_TB',
    data_type = 'scRNAseq',
    centre = 'Shenzhen University',
    version = '10XV2',
)
adata_3

AnnData object with n_obs × n_vars = 490097 × 61533
    obs: 'study', 'individual', 'sample', 'tissue', 'donor', 'age', 'gender', 'status', 'data_type', 'centre', 'version'
    var: 'gene_name'

### Samples with latent TB

In [6]:
# Run SRR11038992 (sample PBMC_LTBI_2): read the unfiltered count matrix and
# attach the sample-level metadata as constant `.obs` columns.
adata_4 = sc.read_h5ad('/home/cartalop/data/lung/tb/Cai2020/SRR11038992/counts_unfiltered/adata.h5ad')
adata_4.obs = adata_4.obs.assign(
    study = 'CaiY_2021',
    individual = 'SAMN14048022',
    sample = 'PBMC_LTBI_2',
    tissue = 'PBMC',
    donor = 'SAMN14048022',
    age = '51',
    gender = 'female',
    status = 'latent_TB',
    data_type = 'scRNAseq',
    centre = 'Shenzhen University',
    version = '10XV2',
)
adata_4

AnnData object with n_obs × n_vars = 531457 × 61533
    obs: 'study', 'individual', 'sample', 'tissue', 'donor', 'age', 'gender', 'status', 'data_type', 'centre', 'version'
    var: 'gene_name'

In [7]:
# Run SRR11038993 (sample PBMC_LTBI_1): read the unfiltered count matrix and
# attach the sample-level metadata as constant `.obs` columns.
adata_5 = sc.read_h5ad('/home/cartalop/data/lung/tb/Cai2020/SRR11038993/counts_unfiltered/adata.h5ad')
adata_5.obs = adata_5.obs.assign(
    study = 'CaiY_2021',
    individual = 'SAMN14048021',
    sample = 'PBMC_LTBI_1',
    tissue = 'PBMC',
    donor = 'SAMN14048021',
    age = '51',
    gender = 'female',
    status = 'latent_TB',
    data_type = 'scRNAseq',
    centre = 'Shenzhen University',
    version = '10XV2',
)
adata_5

AnnData object with n_obs × n_vars = 551750 × 61533
    obs: 'study', 'individual', 'sample', 'tissue', 'donor', 'age', 'gender', 'status', 'data_type', 'centre', 'version'
    var: 'gene_name'

### Samples from healthy controls

In [8]:
# Run SRR11038994 (sample PBMC_HC_2): read the unfiltered count matrix and
# attach the sample-level metadata as constant `.obs` columns.
adata_6 = sc.read_h5ad('/home/cartalop/data/lung/tb/Cai2020/SRR11038994/counts_unfiltered/adata.h5ad')
adata_6.obs = adata_6.obs.assign(
    study = 'CaiY_2021',
    individual = 'SAMN14048020',
    sample = 'PBMC_HC_2',
    tissue = 'PBMC',
    donor = 'SAMN14048020',
    age = '30',
    gender = 'male',
    status = 'Healthy',
    data_type = 'scRNAseq',
    centre = 'Shenzhen University',
    version = '10XV2',
)
adata_6

AnnData object with n_obs × n_vars = 524019 × 61533
    obs: 'study', 'individual', 'sample', 'tissue', 'donor', 'age', 'gender', 'status', 'data_type', 'centre', 'version'
    var: 'gene_name'

In [9]:
# Run SRR11038995 (sample PBMC_HC_1): read the unfiltered count matrix and
# attach the sample-level metadata as constant `.obs` columns.
adata_7 = sc.read_h5ad('/home/cartalop/data/lung/tb/Cai2020/SRR11038995/counts_unfiltered/adata.h5ad')
adata_7.obs = adata_7.obs.assign(
    study = 'CaiY_2021',
    individual = 'SAMN14048019',
    sample = 'PBMC_HC_1',
    tissue = 'PBMC',
    donor = 'SAMN14048019',
    age = '26',
    gender = 'male',
    status = 'Healthy',
    data_type = 'scRNAseq',
    centre = 'Shenzhen University',
    version = '10XV2',
)
adata_7

AnnData object with n_obs × n_vars = 519223 × 61533
    obs: 'study', 'individual', 'sample', 'tissue', 'donor', 'age', 'gender', 'status', 'data_type', 'centre', 'version'
    var: 'gene_name'

### Concatenate all datasets into a single object

In [19]:
# Merge the seven per-run objects into one AnnData.
# `batch_categories` is positional: the i-th label is given to the i-th object
# (adata_1 first), so each label must mirror that object's `sample` value:
#   adata_1 PBMC_TB_3   -> acTB3      adata_4 PBMC_LTBI_2 -> ltTB2
#   adata_2 PBMC_TB_2   -> acTB2      adata_5 PBMC_LTBI_1 -> ltTB1
#   adata_3 PBMC_TB_1   -> acTB1      adata_6 PBMC_HC_2   -> H2
#                                     adata_7 PBMC_HC_1   -> H1
# The fifth label used to be 'ltTB3', which does not correspond to any sample.
# `join = 'inner'` keeps only the variables present in every object.
CaiY2021_TB = adata_1.concatenate(
    adata_2, adata_3, adata_4, adata_5, adata_6, adata_7,
    batch_key = 'batch',
    batch_categories = ['acTB3', 'acTB2', 'acTB1', 'ltTB2', 'ltTB1', 'H2', 'H1'],
    join = 'inner',
)
CaiY2021_TB

  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


AnnData object with n_obs × n_vars = 3701271 × 61533
    obs: 'study', 'individual', 'sample', 'tissue', 'donor', 'age', 'gender', 'status', 'data_type', 'centre', 'version', 'batch'
    var: 'gene_name'

### Fix gene names

In [20]:
# Preview the variable annotation before renaming: index holds the gene IDs,
# the `gene_name` column holds the gene symbols.
CaiY2021_TB.var.head(5)

Unnamed: 0_level_0,gene_name
gene_id,Unnamed: 1_level_1
ENSG00000223972.5,DDX11L1
ENSG00000227232.5,WASH7P
ENSG00000278267.1,MIR6859-1
ENSG00000243485.5,MIR1302-2HG
ENSG00000284332.1,MIR1302-2


In [23]:
# Use gene symbols as variable names and keep the original gene IDs in
# `.var['ensembl_id']`.
#
# The guard makes the cell safe to re-run: after the first execution the
# 'gene_name' column has been promoted to the index and no longer exists, so an
# unguarded `set_index('gene_name')` would raise a KeyError.
if 'gene_name' in CaiY2021_TB.var.columns:
    gene_symbols = pd.Index(CaiY2021_TB.var['gene_name'].astype(str), name = 'gene_name')
    CaiY2021_TB.var['ensembl_id'] = CaiY2021_TB.var_names.to_numpy()
    CaiY2021_TB.var = CaiY2021_TB.var.drop(columns = 'gene_name')
    # Assign through the AnnData setter rather than mutating the DataFrame index
    # in place, so the object keeps its variable index consistent.
    CaiY2021_TB.var_names = gene_symbols

# Gene symbols are not guaranteed to be unique (several gene IDs can share one
# symbol); duplicated variable names make name-based lookups ambiguous, so
# repeated symbols get a numeric suffix. The unique IDs stay in 'ensembl_id'.
CaiY2021_TB.var_names_make_unique()
CaiY2021_TB.var.head()

Unnamed: 0_level_0,ensembl_id
gene_name,Unnamed: 1_level_1
DDX11L1,ENSG00000223972.5
WASH7P,ENSG00000227232.5
MIR6859-1,ENSG00000278267.1
MIR1302-2HG,ENSG00000243485.5
MIR1302-2,ENSG00000284332.1


In [24]:
# Display the merged object to confirm that `.var` now carries 'ensembl_id'.
CaiY2021_TB

AnnData object with n_obs × n_vars = 3701271 × 61533
    obs: 'study', 'individual', 'sample', 'tissue', 'donor', 'age', 'gender', 'status', 'data_type', 'centre', 'version', 'batch'
    var: 'ensembl_id'

### Export object

In [25]:
# Persist the merged, unfiltered object to disk as an .h5ad file.
CaiY2021_TB.write_h5ad('/home/cartalop/data/lung/tb/Cai2020/CaiY2020_PBMC_TB.raw.h5ad')