# Merging datasets into a single h5ad file

In [1]:
import scanpy as sc
import glob
import os
import anndata as ad
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
from scipy.io import mmread
import doubletdetection
import gzip

def read_sc_data(
    counts_file, 
    features_file,
    metadata_file
):
    """Assemble an AnnData object from a counts, a features and a metadata file.

    Parameters
    ----------
    counts_file
        Path to a Matrix Market file with the count matrix. The matrix is
        transposed after reading and converted to CSR.
    features_file
        Path to a text file with the feature names; every whitespace-separated
        token becomes one entry of the ``var`` index.
    metadata_file
        Path to a tab-separated table whose first column is used as the
        ``obs`` index.

    Any path ending in ``gz`` is opened through ``gzip`` in text mode, all
    other paths are opened as plain text.

    Returns
    -------
    AnnData
        Built with ``X`` = counts, ``obs`` = metadata, ``var`` = features and
        ``dtype = np.int64``.
    """
    def _open_text(path):
        # gzip'ed inputs are decompressed transparently, everything else is read as-is
        if path.endswith('gz'):
            return gzip.open(path, 'rt')
        return open(path, 'r')

    with _open_text(counts_file) as handle:
        # transpose due to the way the data was exported to comply with Seurat
        # see also convert_to_raw.ipynb
        counts = mmread(handle).T.tocsr()

    with _open_text(features_file) as handle:
        feature_names = handle.read().rstrip().split()
        features = pd.DataFrame(index = feature_names)

    with _open_text(metadata_file) as handle:
        metadata = pd.read_csv(handle, sep = '\t', index_col = 0)

    return ad.AnnData(
        X = counts,
        obs = metadata,
        var = features,
        dtype = np.int64
    )

def detect_doublets(sample_adata, n_jobs = 1, n_iters = 20, random_state = 0, *, sample_id = None, **kwargs):
    """Call doublets for one sample with ``doubletdetection.BoostClassifier``.

    Parameters
    ----------
    sample_adata
        AnnData holding the cells of a single sample. ``sc.pp.filter_genes`` is
        called on it for its in-place effect (``min_cells = 1``), so pass a copy
        if the unfiltered object is still needed.
    n_jobs, n_iters, random_state
        Forwarded to ``BoostClassifier``.
    sample_id
        Keyword-only label used in the progress message. When omitted, a
        module-level variable named ``sample_id`` is used if one exists (this is
        what the function silently relied on before), otherwise ``'sample'``.
    **kwargs
        Forwarded to ``BoostClassifier.predict`` (e.g. ``voter_thresh``).

    Returns
    -------
    tuple
        ``(doublet_frame, n_doublets, n_cells)`` where ``doublet_frame`` is a
        DataFrame indexed like ``sample_adata.obs`` with the columns
        ``'doublet'`` and ``'doublet_score'``.
    """
    if sample_id is None:
        # Backward compatibility: the message used to read a notebook global
        # called `sample_id`, which raised NameError whenever it was not defined.
        sample_id = globals().get('sample_id', 'sample')

    sc.pp.filter_genes(
        sample_adata, 
        min_cells = 1
    )
    
    clf = doubletdetection.BoostClassifier(
        n_iters = n_iters,
        clustering_algorithm = "louvain",
        standard_scaling = True,
        pseudocount = 0.1,
        n_jobs = n_jobs,
        random_state = random_state,
        verbose = False
    )
    doublet = clf.fit(sample_adata.X).predict(**kwargs)
    n_doublets = int(doublet.sum())
    n_cells = len(doublet)

    doublet_frame = pd.DataFrame(
        {
            'doublet': doublet,
            'doublet_score': clf.doublet_score()
        },
        index = sample_adata.obs.index
    )
    print(f'{sample_id}: predicted {n_doublets} / {n_cells} cells as doublets')
    
    return doublet_frame, n_doublets, n_cells

In [None]:
# Load the sirolimus trial data and keep only the cells whose timepoint is
# 'baseline_general'.
sirolimus_trial_data = sc.read_h5ad('../raw/srcx_sirolimus.h5ad')
is_baseline = sirolimus_trial_data.obs['timepoint'] == 'baseline_general'
sirolimus_trial_baseline = sirolimus_trial_data[is_baseline].copy()

# Drop the visit columns and add the annotation columns used by the other
# datasets in this notebook. Note that cell_fraction is the string 'None'.
sirolimus_trial_baseline.obs.drop(
    columns = ['timepoint', 'visit_id'],
    inplace = True
)
sirolimus_trial_baseline.obs['status'] = 'sarcoidosis'
sirolimus_trial_baseline.obs['cell_fraction'] = 'None'
sirolimus_trial_baseline

In [None]:
# Additional baseline samples stored in a separate file; they get the same
# annotation columns as the trial baseline data.
sirolimus_missing_baseline = sc.read_h5ad('../raw/srcx_missing_baseline_samples.h5ad')
sirolimus_missing_baseline.obs.drop(
    columns = ['visit_id'],
    inplace = True
)
sirolimus_missing_baseline.obs['status'] = 'sarcoidosis'
# cell_fraction is set to the string 'None', not the Python None object
sirolimus_missing_baseline.obs['cell_fraction'] = 'None'
sirolimus_missing_baseline

In [None]:
# Load the psoriasis count matrix (10x HDF5) and its metadata table
# (tab-separated, first column used as index).
psoriasis = sc.read_10x_h5('../raw/ps_cellranger.filtered.h5')
metadata = pd.read_csv(
    '../raw/ps_cellranger.metadata.tsv',
    sep = '\t',
    index_col = 0
)

# Left join on the index: every row of the existing obs table is kept and the
# metadata columns are attached where the index matches.
psoriasis.obs = psoriasis.obs.merge(
    metadata,
    how = 'left',
    left_index = True,
    right_index = True
)
psoriasis

In [None]:
# Use the same lower-case column names as the other datasets in this notebook.
psoriasis_column_names = {
    'Status': 'status',
    'Tissue': 'tissue',
    'Cell_fraction': 'cell_fraction'
}
psoriasis.obs.rename(columns = psoriasis_column_names, inplace = True)
psoriasis.var_names_make_unique()
psoriasis

In [None]:
hiv = sc.read_h5ad(
    '../raw/hiv.h5ad'
)
# Keep only the cells with exactly one feature call; the call itself is used
# as the tissue label below.
hiv = hiv[hiv.obs.num_features == 1].copy()
hiv.obs.drop(
    columns = ['num_features', 'num_umis'],
    inplace = True
)
hiv.obs.rename(
    columns = {
        'feature_call': 'tissue'
    },
    inplace = True
)
hiv.obs['status'] = 'normal'
# cell_fraction is the string 'None', matching the other datasets
hiv.obs['cell_fraction'] = 'None'

# Index the genes by gene name instead of the identifier stored in the file.
hiv.var.index = list(hiv.var.gene_name.values)

# sample_id becomes '<sample_id>_<tissue>'.
# Plain column assignment is used on purpose: `obs.loc[:, 'sample_id'] = ...`
# writes into the existing column in place on pandas >= 2.0, which can fail or
# keep the old dtype (e.g. for a categorical column), whereas `obs[col] = ...`
# replaces the column on every pandas version.
hiv.obs['sample_id'] = hiv.obs[['sample_id', 'tissue']].apply(
    lambda row: '_'.join(row.to_list()),
    axis = 1
)
hiv.var_names_make_unique()
hiv

In [None]:
# Keep only the cells of the three patients listed below; all other patients
# in the hiv dataset are removed.
patients_to_keep = ['SSHIV35_1', 'SSHIV35_2', 'SSHIV35_4']
is_kept_patient = hiv.obs.patient_id.isin(patients_to_keep)
hiv = hiv[is_kept_patient].copy()
hiv

In [None]:
ps_pbmc = sc.read_h5ad('../raw/ps_pbmc.h5ad')

# Keep only the cells with exactly one feature call; here the call is used as
# the sample id.
has_single_call = ps_pbmc.obs['num_features'] == 1
ps_pbmc = ps_pbmc[has_single_call].copy()
ps_pbmc.obs.drop(
    columns = ['num_umis', 'num_features'],
    inplace = True
)
ps_pbmc.obs.rename(
    columns = {'feature_call': 'sample_id'},
    inplace = True
)

# Annotation columns shared with the other datasets; patient_id is a copy of
# sample_id and cell_fraction is the string 'None'.
ps_pbmc.obs['status'] = 'psoriasis'
ps_pbmc.obs['tissue'] = 'blood'
ps_pbmc.obs['patient_id'] = ps_pbmc.obs.sample_id
ps_pbmc.obs['cell_fraction'] = 'None'

# Index the genes by gene name and resolve duplicated names.
ps_pbmc.var.index = list(ps_pbmc.var.gene_name)
ps_pbmc.var_names_make_unique()
ps_pbmc

In [None]:
# Load the UC count matrix (10x HDF5) and its metadata table (tab-separated,
# first column used as index).
uc = sc.read_10x_h5('../raw/uc_cellranger.filtered.h5')
metadata = pd.read_csv(
    '../raw/uc_cellranger.metadata.tsv',
    sep = '\t',
    index_col = 0
)

# Left join on the index so that every row of the existing obs table is kept.
uc.obs = uc.obs.merge(
    metadata,
    how = 'left',
    left_index = True,
    right_index = True
)
uc.var_names_make_unique()
uc

In [None]:
# Show the distinct sample ids present in the UC metadata.
uc.obs['sample_id'].unique()

In [None]:
# One '<GSM sample id>\t<sample title>' pair per line.
map_string = '''
    GSM3576396\tC9_R_10x_scRNA
    GSM3576397\tC12_R_10x_scRNA
    GSM3576398\tC16_R_10x_scRNA
    GSM3576399\tU4_R_10x_scRNA
    GSM3576400\tU5_R_10x_scRNA
    GSM3576401\tU34_R_10x_scRNA
    GSM3576402\tU35_R_10x_scRNA
    GSM3576403\tU41_R_10x_scRNA
    GSM3576404\tU44_R_10x_scRNA
    GSM3576405\tU45_R_10x_scRNA
    GSM3576406\tC17_R_10x_scRNA
    GSM3576407\tC18_R_10x_scRNA
    GSM3576408\tC19_R_10x_scRNA
    GSM3576409\tC21_R_10x_scRNA
    GSM3576410\tC30_R_10x_scRNA
    GSM3576411\tC12_PBMC_10x_scRNA
    GSM3576412\tC16_PBMC_10x_scRNA
    GSM3576413\tU4_PBMC_10x_scRNA
    GSM3576414\tU5_PBMC_10x_scRNA
    GSM3576415\tU34_PBMC_10x_scRNA
    GSM3576416\tU35_PBMC_10x_scRNA
    GSM3576417\tU41_PBMC_10x_scRNA
    GSM3576418\tU44_PBMC_10x_scRNA
    GSM3576419\tU45_PBMC_10x_scRNA
    GSM3576420\tC17_PBMC_10x_scRNA
    GSM3576421\tC18_PBMC_10x_scRNA
    GSM3576422\tC19_PBMC_10x_scRNA
    GSM3576423\tC21_PBMC_10x_scRNA
    GSM3576424\tC30_PBMC_10x_scRNA
    GSM3576425\tC33_PBMC_10x_scRNA
    GSM3576426\tC17_I_10x_scRNA
    GSM3576427\tC18_I_10x_scRNA
    GSM3576428\tC19_I_10x_scRNA
    GSM3576429\tC21_I_10x_scRNA
    GSM3576430\tC30_I_10x_scRNA
    GSM3576431\tC33_I_10x_scRNA
'''

# Map every GSM id to its patient id, i.e. the part of the sample title in
# front of the first underscore ('C9_R_10x_scRNA' -> 'C9'). The empty strings
# produced by the leading and trailing newline of the literal are skipped.
gsm_to_patient_id = {
    gsm_id: sample_title.split('_')[0]
    for gsm_id, sample_title in (
        line.strip().split('\t')
        for line in map_string.split('\n')
        if line
    )
}

gsm_to_patient_id

In [None]:
# Use the same lower-case column names as the other datasets in this notebook.
uc_column_names = {
    'Status': 'status',
    'Tissue': 'tissue'
}
uc.obs.rename(columns = uc_column_names, inplace = True)

# Translate each GSM sample id into its patient id; an id that is missing from
# the mapping raises a KeyError.
uc.obs['patient_id'] = uc.obs.sample_id.apply(
    lambda gsm_id: gsm_to_patient_id[gsm_id]
)
# cell_fraction is the string 'None', matching the other datasets
uc.obs['cell_fraction'] = 'None'
uc

In [None]:
# Stack all prepared datasets into one AnnData object (ad.concat is called
# with its default arguments) and make the resulting cell names unique.
datasets = [
    psoriasis,
    ps_pbmc,
    hiv,
    sirolimus_missing_baseline,
    sirolimus_trial_baseline,
    uc
]
adata = ad.concat(datasets)
adata.obs_names_make_unique()
adata

In [None]:
# NOTE: this cell apparently cannot run in the background (i.e. starting it and
# closing the window); the window has to stay open until it finishes. The
# loading bar is suspected to be the culprit.

# Per-sample results, appended in the order the samples are processed.
doublet_frames, doublet_counts, cell_counts, labels = [], [], [], []

# The loop variable must stay named `sample_id`: detect_doublets reads a global
# of that name for its progress message.
for sample_id in adata.obs.sample_id.unique():
    print(sample_id)
    sample_adata = adata[adata.obs.sample_id == sample_id].copy()

    if sample_adata.shape[0] >= 30:
        doublet_frame, ndoublets, ncells = detect_doublets(
            sample_adata,
            n_jobs = 1,
            n_iters = 20,
            random_state = 18247,
            voter_thresh = 0.5
        )

    else:
        # Samples with fewer than 30 cells are not passed to the classifier;
        # all of their cells are recorded as singlets with score 0.
        ncells = sample_adata.shape[0]
        ndoublets = 0
        index = sample_adata.obs.index
        doublet_frame = pd.DataFrame(
            {
                'doublet': [False] * len(index),
                'doublet_score': [0] * len(index)
            },
            index = index
        )

    doublet_frames.append(doublet_frame)
    doublet_counts.append(ndoublets)
    cell_counts.append(ncells)
    labels.append(sample_id)

In [None]:
# Combine the per-sample doublet calls into one table indexed by cell name.
df = pd.concat(doublet_frames)

# Store the doublet flag as a real boolean column.
# Plain column assignment is used on purpose: `df.loc[:, 'doublet'] = ...`
# writes into the existing column in place on pandas >= 2.0 and therefore does
# not reliably change its dtype, whereas `df['doublet'] = ...` replaces the
# column on every pandas version.
df['doublet'] = df['doublet'].astype(bool)
df

In [None]:
# Attach the doublet calls to the cell annotations. The left join on the index
# keeps every row of the existing obs table.
adata.obs = adata.obs.merge(
    df,
    how = 'left',
    left_index = True,
    right_index = True
)
adata

In [None]:
# Bar plot of the percentage of predicted doublets per sample.
fig, ax = plt.subplots()

# Tick labels of the form '<sample> (<doublets> / <cells>)'.
# The counts are taken from the loop variables of this comprehension; the
# previous version formatted the leftover global `ndoublets`, so every label
# showed the doublet count of the last processed sample.
# `labels` itself is left untouched so that re-running the cell does not
# append the counts a second time.
tick_labels = [
    f'{label} ({n_doublets} / {n_cells})'
    for n_doublets, n_cells, label in zip(doublet_counts, cell_counts, labels)
]
ax.bar(
    range(1, len(tick_labels) + 1),
    np.array(doublet_counts) / np.array(cell_counts) * 100,
    tick_label = tick_labels
)
ax.set_ylabel('percent doublets')
fig.set_figheight(10)
fig.set_figwidth(20)
fig.tight_layout()

In [None]:
# Persist the merged dataset, including the doublet annotation columns.
adata.write_h5ad('../data/inflammatory_disease.h5ad')

In [2]:
# Reload the merged dataset from disk.
adata = sc.read_h5ad('../data/inflammatory_disease.h5ad')
adata

AnnData object with n_obs × n_vars = 927252 × 33538
    obs: 'sample_id', 'patient_id', 'status', 'tissue', 'cell_fraction', 'doublet', 'doublet_score'

In [4]:
# Show the distinct tissue labels in the merged dataset.
adata.obs['tissue'].unique()

['dermis', 'epidermis', 'blood', 'skin', 'Blood', 'Skin_L', 'Skin_NL', 'PBMC', 'rectum', 'ileum']
Categories (10, object): ['Blood', 'PBMC', 'Skin_L', 'Skin_NL', ..., 'epidermis', 'ileum', 'rectum', 'skin']

In [7]:
# Show the distinct status labels in the merged dataset.
adata.obs['status'].unique()

['normal', 'atopic eczema', 'psoriasis', 'sarcoidosis', 'UC', 'control']
Categories (6, object): ['UC', 'atopic eczema', 'control', 'normal', 'psoriasis', 'sarcoidosis']

In [10]:
# Export the blood cells of healthy donors. Both the status and the tissue
# labels differ between the source datasets, so all spellings are listed.
is_control = adata.obs.status.isin(['normal', 'control'])
is_blood = adata.obs.tissue.isin(['blood', 'Blood', 'PBMC'])
adata[is_control & is_blood].write_h5ad('../data/blood.control.h5ad')