In [1]:
# lpr_scregseg environment

# Combine data for extended treatment-control use case as part of revisions r1
We created a feature set for the ATAC data that is identical to the one we previously used for the additional data.

Here, we combine all preprocessed data sets into a single anndata object.

In [2]:
import os
import pandas as pd
from anndata import read_h5ad
import scanpy as sc
import scregseg
import matplotlib.pyplot as plt
import numpy as np

In [3]:
# Container for the preprocessed data sets; each key later becomes the
# `data_set` label of the corresponding cells in the combined object.
adata_dict = dict()

In [4]:
# Preprocessed DOGMA-seq data (Mimitou2021) for the treatment-control use case.
adata_dict['treatment_control_use_case'] = read_h5ad(
    "liam_manuscript_reproducibility/data/derived/Mimitou2021/DOGMA_seq/preprocessed_DOGMA.h5ad"
)

In [5]:
# Preprocessed `perm_cells` data (Swanson2021), one of the additional data sets.
adata_dict['perm_cells'] = read_h5ad(
    "liam_manuscript_reproducibility/data/derived/Swanson2021/perm_cells/preprocessed_perm_cells.h5ad"
)

Variable names are not unique. To make them unique, call `.var_names_make_unique`.


In [6]:
# Preprocessed 10x `10k_sorted` nuclei data, one of the additional data sets.
adata_dict['10k_sorted_nuclei'] = read_h5ad(
    "liam_manuscript_reproducibility/data/derived/10x/10k_sorted/preprocessed_10k_sorted_nuclei.h5ad"
)

Variable names are not unique. To make them unique, call `.var_names_make_unique`.


In [7]:
# Make the variable names unique, as suggested by the warning raised when
# this data set was loaded. Modifies the object in place.
adata_dict['perm_cells'].var_names_make_unique()

In [8]:
# Make the variable names unique, as suggested by the warning raised when
# this data set was loaded. Modifies the object in place.
adata_dict['10k_sorted_nuclei'].var_names_make_unique()

In [9]:
# Sanity check: list the data sets that have been loaded so far.
adata_dict.keys()

dict_keys(['treatment_control_use_case', 'perm_cells', '10k_sorted_nuclei'])

In [10]:
# Combine datasets
# Stack all loaded data sets into one object. `join='inner'` restricts the
# variables to those shared by all data sets. Every cell is labelled with the
# name of its data set in `obs['data_set']`, and that name is appended to the
# cell name with '_' as separator.
#
# The labels are passed as an explicit list because `batch_categories` is
# documented as a sequence, not a dict view. `adata_dict.values()` and
# `list(adata_dict)` share the dict's insertion order, so objects and labels
# stay aligned.
#
# NOTE(review): `AnnData.concatenate` is deprecated upstream in favour of
# `anndata.concat`. Not migrated here because `concat` appears to treat `var`
# columns differently (the per-data-set `genome-<data_set>` columns would
# presumably be lost) - confirm before switching.
adata_concat = sc.AnnData.concatenate(*adata_dict.values(),
                                      join='inner', batch_key='data_set',
                                      batch_categories=list(adata_dict),
                                      uns_merge=None, index_unique='_', fill_value=None)

In [11]:
# Inspect which per-cell annotation columns are present after combining.
adata_concat.obs.columns

Index(['batch', 'counts', 'original_barcodes', 'n_fragments', 'n_duplicate',
       'n_mito', 'n_unique', 'altius_count', 'altius_frac',
       'gene_bodies_count', 'gene_bodies_frac', 'peaks_count', 'peaks_frac',
       'tss_count', 'tss_frac', 'cell_name', 'well_id', 'chip_id', 'batch_id',
       'pbmc_sample_id', 'DoubletScore', 'DoubletEnrichment', 'TSSEnrichment',
       'celltype', 'broad_celltype', 'pass_rnaQC', 'pass_accQC', 'data_set'],
      dtype='object')

In [12]:
# Summary of the combined object: dimensions and available annotations.
adata_concat

AnnData object with n_obs × n_vars = 44580 × 36588
    obs: 'batch', 'counts', 'original_barcodes', 'n_fragments', 'n_duplicate', 'n_mito', 'n_unique', 'altius_count', 'altius_frac', 'gene_bodies_count', 'gene_bodies_frac', 'peaks_count', 'peaks_frac', 'tss_count', 'tss_frac', 'cell_name', 'well_id', 'chip_id', 'batch_id', 'pbmc_sample_id', 'DoubletScore', 'DoubletEnrichment', 'TSSEnrichment', 'celltype', 'broad_celltype', 'pass_rnaQC', 'pass_accQC', 'data_set'
    var: 'gene_ids', 'feature_types', 'genome-10k_sorted_nuclei', 'genome-perm_cells'
    obsm: 'ATAC'

In [13]:
# Inspect the per-cell annotations. Columns that a data set does not provide
# appear empty for the cells of that data set.
adata_concat.obs

Unnamed: 0,batch,counts,original_barcodes,n_fragments,n_duplicate,n_mito,n_unique,altius_count,altius_frac,gene_bodies_count,...,batch_id,pbmc_sample_id,DoubletScore,DoubletEnrichment,TSSEnrichment,celltype,broad_celltype,pass_rnaQC,pass_accQC,data_set
AAACAGCCATAGCGAG-1_DOGMA_DIG_CTRL_treatment_control_use_case,DOGMA_DIG_CTRL,942.0,,,,,,,,,...,,,,,,,,,,treatment_control_use_case
AAACAGCCATCCATCT-1_DOGMA_DIG_CTRL_treatment_control_use_case,DOGMA_DIG_CTRL,3439.0,,,,,,,,,...,,,,,,,,,,treatment_control_use_case
AAACAGCCATTGACAT-1_DOGMA_DIG_CTRL_treatment_control_use_case,DOGMA_DIG_CTRL,1773.0,,,,,,,,,...,,,,,,,,,,treatment_control_use_case
AAACATGCAAAGCGCA-1_DOGMA_DIG_CTRL_treatment_control_use_case,DOGMA_DIG_CTRL,3826.0,,,,,,,,,...,,,,,,,,,,treatment_control_use_case
AAACATGCACCTAATG-1_DOGMA_DIG_CTRL_treatment_control_use_case,DOGMA_DIG_CTRL,1979.0,,,,,,,,,...,,,,,,,,,,treatment_control_use_case
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGGTGACATGC-1_10k_sorted_nuclei,10k_sorted_nuclei,,,,,,,,,,...,,,,,,naive CD8 T cells,Lymphoid,True,True,10k_sorted_nuclei
TTTGTTGGTGTTAAAC-1_10k_sorted_nuclei,10k_sorted_nuclei,,,,,,,,,,...,,,,,,naive CD8 T cells,Lymphoid,True,True,10k_sorted_nuclei
TTTGTTGGTTAGGATT-1_10k_sorted_nuclei,10k_sorted_nuclei,,,,,,,,,,...,,,,,,CD56 (bright) NK cells,Lymphoid,True,True,10k_sorted_nuclei
TTTGTTGGTTGGTTAG-1_10k_sorted_nuclei,10k_sorted_nuclei,,,,,,,,,,...,,,,,,memory CD4 T cells,Lymphoid,True,True,10k_sorted_nuclei


In [14]:
# Persist the combined object for the extended treatment-control use case.
adata_concat.write_h5ad(
    "liam_manuscript_reproducibility/data/derived/Mimitou2021/DOGMA_seq/extended_treatment_control_use_case_revisions_r1.h5ad"
)

... storing 'batch' as categorical
... storing 'original_barcodes' as categorical
... storing 'cell_name' as categorical
