# Preprocessing of the benchmarking datasets

In [1]:
import anndata as ad
import scanpy as sc
import numpy as np
import pandas as pd
import os

In [2]:
print(os.path.abspath(os.path.join(os.getcwd(), '..')))

/data2/yixuan/SCMBench/data


## Feature selection

In [3]:
for data_name in ['10x-Multiome-Pbmc10k','Chen-2019','Ma-2020','Muto-2021','Yao-2021']:  
    dir_path = os.path.abspath(os.path.join(os.getcwd(),'../download/'+data_name))  
    atac=sc.read_h5ad(os.path.join(dir_path,data_name+'-ATAC.h5ad'))
    rna=sc.read_h5ad(os.path.join(dir_path,data_name+'-RNA.h5ad'))
    print(data_name)
    print('atac:', atac)
    print('rna:', rna)
    sc.pp.highly_variable_genes(
            atac,
            flavor="seurat_v3",
            n_top_genes=80000,
            subset=True,
        )

    sc.pp.highly_variable_genes(
            rna,
            flavor="seurat_v3",
            n_top_genes=8000,
            subset=True,
        )
    atac.write_h5ad(os.path.join(dir_path,data_name+'-small-ATAC.h5ad'))
    rna.write_h5ad(os.path.join(dir_path,data_name+'-small-RNA.h5ad'))

10x-Multiome-Pbmc10k
atac: AnnData object with n_obs × n_vars = 9631 × 107194
    obs: 'domain', 'protocol', 'dataset', 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'nCount_ATAC', 'nFeature_ATAC', 'nCount_SCT', 'nFeature_SCT', 'SCT.weight', 'ATAC.weight', 'wsnn_res.0.8', 'seurat_clusters', 'sub.cluster', 'cell_type'
    var: 'feature_types', 'genome', 'chrom', 'chromStart', 'chromEnd', 'n_counts'
rna: AnnData object with n_obs × n_vars = 9631 × 29095
    obs: 'domain', 'protocol', 'dataset', 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'nCount_ATAC', 'nFeature_ATAC', 'nCount_SCT', 'nFeature_SCT', 'SCT.weight', 'ATAC.weight', 'wsnn_res.0.8', 'seurat_clusters', 'sub.cluster', 'cell_type'
    var: 'gene_ids', 'feature_types', 'genome', 'chrom', 'chromStart', 'chromEnd', 'name', 'score', 'strand', 'thickStart', 'thickEnd', 'itemRgb', 'blockCount', 'blockSizes', 'blockStarts', 'gene_type', 'gene_name', 'hgnc_id', 'havana_gene', 'tag', 'n_counts', 'highly_variable',

## Batch split

### Muto-2021

In [4]:
atac=sc.read_h5ad(os.path.abspath(os.path.join(os.getcwd(),'../download/Muto-2021/Muto-2021-ATAC.h5ad'))) 
rna=sc.read_h5ad(os.path.abspath(os.path.join(os.getcwd(),'../download/Muto-2021/Muto-2021-RNA.h5ad')))
print(rna.obs['batch'])
print(atac.obs['batch'])

cells
AAACCTGAGGGTCTCC-1    e0def004-9e30-4a3b-9a65-007110f3a1f2
AAACCTGAGTGTTAGA-1    e0def004-9e30-4a3b-9a65-007110f3a1f2
AAACCTGCAAGCGCTC-1    e0def004-9e30-4a3b-9a65-007110f3a1f2
AAACCTGCACCAGATT-1    e0def004-9e30-4a3b-9a65-007110f3a1f2
AAACCTGCAGTCAGAG-1    e0def004-9e30-4a3b-9a65-007110f3a1f2
                                      ...                 
TTTGTCAGTTAAGACA-5    8213a3f7-2437-4e8a-b836-caec33df901d
TTTGTCAGTTCCGGCA-5    8213a3f7-2437-4e8a-b836-caec33df901d
TTTGTCATCACAACGT-5    8213a3f7-2437-4e8a-b836-caec33df901d
TTTGTCATCACGCGGT-5    8213a3f7-2437-4e8a-b836-caec33df901d
TTTGTCATCCTGCTTG-5    8213a3f7-2437-4e8a-b836-caec33df901d
Name: batch, Length: 19985, dtype: category
Categories (5, object): ['8c570254-4bef-48d8-bd79-c812f60835a5', '5028f75a-8c09-4155-a232-ad7dbfa6042e', '8213a3f7-2437-4e8a-b836-caec33df901d', 'e0def004-9e30-4a3b-9a65-007110f3a1f2', 'f6c0f811-2fb8-4989-b796-37c14b055517']
cells
AAACGAAAGGTTGTTC-3    5028f75a-8c09-4155-a232-ad7dbfa6042e
AAACGAACAAA

In [5]:
dic={'8c570254-4bef-48d8-bd79-c812f60835a5':'1', '5028f75a-8c09-4155-a232-ad7dbfa6042e':'2', '8213a3f7-2437-4e8a-b836-caec33df901d':'3', 'e0def004-9e30-4a3b-9a65-007110f3a1f2':'4', 'f6c0f811-2fb8-4989-b796-37c14b055517':'5'}
for batch in ['8c570254-4bef-48d8-bd79-c812f60835a5', '5028f75a-8c09-4155-a232-ad7dbfa6042e', '8213a3f7-2437-4e8a-b836-caec33df901d', 'e0def004-9e30-4a3b-9a65-007110f3a1f2', 'f6c0f811-2fb8-4989-b796-37c14b055517']:
    temp_rna=rna[rna.obs['batch']==batch]
    temp_atac=atac[atac.obs['batch']==batch]
    sc.pp.highly_variable_genes(
        temp_atac,
        flavor="seurat_v3",
        n_top_genes=80000,
        subset=True,
    )
    sc.pp.highly_variable_genes(
            temp_rna,
            flavor="seurat_v3",
            n_top_genes=8000,
            subset=True,
        )

    temp_atac.write_h5ad(os.path.abspath(os.path.join(os.getcwd(),'../download/Muto-2021/Muto-2021-batch-'+dic[batch]+'-small-ATAC.h5ad')))
    temp_rna.write_h5ad(os.path.abspath(os.path.join(os.getcwd(),'../download/Muto-2021/Muto-2021-batch-'+dic[batch]+'-small-RNA.h5ad')))


  adata.uns["hvg"] = {"flavor": flavor}
  adata.uns["hvg"] = {"flavor": flavor}
  adata.uns["hvg"] = {"flavor": flavor}
  adata.uns["hvg"] = {"flavor": flavor}
  adata.uns["hvg"] = {"flavor": flavor}
  adata.uns["hvg"] = {"flavor": flavor}
  adata.uns["hvg"] = {"flavor": flavor}
  adata.uns["hvg"] = {"flavor": flavor}
  adata.uns["hvg"] = {"flavor": flavor}
  adata.uns["hvg"] = {"flavor": flavor}


In [6]:
import random
selected_rna=[]
selected_atac=[]
for batch in ['8c570254-4bef-48d8-bd79-c812f60835a5', '5028f75a-8c09-4155-a232-ad7dbfa6042e', '8213a3f7-2437-4e8a-b836-caec33df901d', 'e0def004-9e30-4a3b-9a65-007110f3a1f2', 'f6c0f811-2fb8-4989-b796-37c14b055517']:
    batch_list=rna[rna.obs['batch']==batch].obs.index.tolist()
    selected_rna.extend(random.sample(batch_list, 2000))
    print(rna[rna.obs['batch']==batch].obs.shape)
    batch_list=atac[atac.obs['batch']==batch].obs.index.tolist()
    selected_atac.extend(random.sample(batch_list, 2500))
    print(atac[atac.obs['batch']==batch].obs.shape)
print(len(selected_rna),len(selected_atac))

(3683, 37)
(4805, 33)
(5464, 37)
(6658, 33)
(3804, 37)
(5493, 33)
(4114, 37)
(3677, 33)
(2920, 37)
(3572, 33)
10000 12500


In [7]:
atac_small=atac[atac.obs.index.isin(selected_atac)]
rna_small=rna[rna.obs.index.isin(selected_rna)]
print(atac_small)
print(rna_small)

View of AnnData object with n_obs × n_vars = 12500 × 99019
    obs: 'assay_ontology_term_id', 'development_stage_ontology_term_id', 'donor_uuid', 'ethnicity_ontology_term_id', 'library_uuid', 'organism_ontology_term_id', 'sample_preservation_method', 'sample_uuid', 'suspension_type', 'suspension_uuid', 'tissue_ontology_term_id', 'is_primary_data', 'author_cell_type', 'cell_type_category', 'cell_type_ontology_term_id', 'author_cluster', 'disease_ontology_term_id', 'reported_diseases', 'sex_ontology_term_id', 'nCount_RNA', 'nFeature_RNA', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'ethnicity', 'development_stage', 'domain', 'protocol', 'dataset', 'batch'
    var: 'chrom', 'chromStart', 'chromEnd', 'genome', 'n_counts'
    uns: 'X_normalization', 'default_embedding', 'layer_descriptions', 'schema_version', 'title'
    obsm: 'X_umap'
View of AnnData object with n_obs × n_vars = 10000 × 27146
    obs: 'assay_ontology_term_id', 'development_stage_ontology_term_id', 'donor_

In [8]:

sc.pp.highly_variable_genes(
        rna_small,
        flavor="seurat_v3",
        n_top_genes=8000,
        subset=True,
    )
sc.pp.highly_variable_genes(
        atac_small,
        flavor="seurat_v3",
        n_top_genes=80000,
        subset=True,
        span=0.9
    )
print(atac_small,rna_small)
atac_small.write_h5ad(os.path.abspath(os.path.join(os.getcwd(),'../download/Muto-2021/Muto-2021-sampled-small-ATAC.h5ad')))
rna_small.write_h5ad(os.path.abspath(os.path.join(os.getcwd(),'../download/Muto-2021/Muto-2021-sampled-small-RNA.h5ad')))

  adata.uns["hvg"] = {"flavor": flavor}
  adata.uns["hvg"] = {"flavor": flavor}


AnnData object with n_obs × n_vars = 12500 × 80000
    obs: 'assay_ontology_term_id', 'development_stage_ontology_term_id', 'donor_uuid', 'ethnicity_ontology_term_id', 'library_uuid', 'organism_ontology_term_id', 'sample_preservation_method', 'sample_uuid', 'suspension_type', 'suspension_uuid', 'tissue_ontology_term_id', 'is_primary_data', 'author_cell_type', 'cell_type_category', 'cell_type_ontology_term_id', 'author_cluster', 'disease_ontology_term_id', 'reported_diseases', 'sex_ontology_term_id', 'nCount_RNA', 'nFeature_RNA', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'ethnicity', 'development_stage', 'domain', 'protocol', 'dataset', 'batch'
    var: 'chrom', 'chromStart', 'chromEnd', 'genome', 'n_counts', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm'
    uns: 'X_normalization', 'default_embedding', 'layer_descriptions', 'schema_version', 'title', 'hvg'
    obsm: 'X_umap' AnnData object with n_obs × n_vars = 10000 × 8000
    obs

### Ma-2020

In [9]:
atac=sc.read_h5ad(os.path.abspath(os.path.join(os.getcwd(),'../download/Ma-2020/Ma-2020-ATAC.h5ad')))
rna=sc.read_h5ad(os.path.abspath(os.path.join(os.getcwd(),'../download/Ma-2020/Ma-2020-RNA.h5ad')))
print(rna.obs['batch'])
print(atac.obs['batch'])

cells
R1.01.R2.01.R3.06.P1.55    55
R1.01.R2.03.R3.68.P1.55    55
R1.01.R2.05.R3.15.P1.53    53
R1.01.R2.05.R3.40.P1.55    55
R1.01.R2.05.R3.49.P1.55    55
                           ..
R1.92.R2.79.R3.05.P1.56    56
R1.93.R2.20.R3.18.P1.53    53
R1.93.R2.80.R3.62.P1.55    55
R1.93.R2.91.R3.82.P1.56    56
R1.94.R2.75.R3.40.P1.55    55
Name: batch, Length: 32231, dtype: category
Categories (4, object): ['53', '54', '55', '56']
cells
R1.01.R2.01.R3.06.P1.55    55
R1.01.R2.03.R3.68.P1.55    55
R1.01.R2.05.R3.15.P1.53    53
R1.01.R2.05.R3.40.P1.55    55
R1.01.R2.05.R3.49.P1.55    55
                           ..
R1.92.R2.79.R3.05.P1.56    56
R1.93.R2.20.R3.18.P1.53    53
R1.93.R2.80.R3.62.P1.55    55
R1.93.R2.91.R3.82.P1.56    56
R1.94.R2.75.R3.40.P1.55    55
Name: batch, Length: 32231, dtype: category
Categories (4, object): ['53', '54', '55', '56']


In [10]:
for batch in ['53','54','55','56']:
    temp_rna=rna[rna.obs['batch']==batch]
    temp_atac=atac[atac.obs['batch']==batch]
    sc.pp.highly_variable_genes(
        temp_atac,
        flavor="seurat_v3",
        n_top_genes=80000,
        subset=True,
    )
    sc.pp.highly_variable_genes(
            temp_rna,
            flavor="seurat_v3",
            n_top_genes=8000,
            subset=True,
        )

    temp_atac.write_h5ad(os.path.abspath(os.path.join(os.getcwd(),'../download/Ma-2020/Ma-2020-batch-'+batch+'-small-ATAC.h5ad')))
    temp_rna.write_h5ad(os.path.abspath(os.path.join(os.getcwd(),'../download/Ma-2020/Ma-2020-batch-'+batch+'-small-RNA.h5ad')))


  adata.uns["hvg"] = {"flavor": flavor}
  adata.uns["hvg"] = {"flavor": flavor}
  adata.uns["hvg"] = {"flavor": flavor}
  adata.uns["hvg"] = {"flavor": flavor}
  adata.uns["hvg"] = {"flavor": flavor}
  adata.uns["hvg"] = {"flavor": flavor}
  adata.uns["hvg"] = {"flavor": flavor}
  adata.uns["hvg"] = {"flavor": flavor}


In [11]:
import random
selected_rna=[]
selected_atac=[]
for batch in ['53','54','55','56']:
    batch_list=rna[rna.obs['batch']==batch].obs.index.tolist()
    selected_rna.extend(random.sample(batch_list, 2000))
    print(rna[rna.obs['batch']==batch].obs.shape)
    batch_list=atac[atac.obs['batch']==batch].obs.index.tolist()
    selected_atac.extend(random.sample(batch_list, 2500))
    print(atac[atac.obs['batch']==batch].obs.shape)
print(len(selected_rna),len(selected_atac))

(5692, 5)
(5692, 5)
(10709, 5)
(10709, 5)
(9903, 5)
(9903, 5)
(5927, 5)
(5927, 5)
8000 10000


In [12]:
atac_small=atac[atac.obs.index.isin(selected_atac)]
rna_small=rna[rna.obs.index.isin(selected_rna)]
print(atac_small)
print(rna_small)

View of AnnData object with n_obs × n_vars = 10000 × 340341
    obs: 'domain', 'protocol', 'dataset', 'cell_type', 'batch'
    var: 'chrom', 'chromStart', 'chromEnd', 'genome', 'n_counts'
View of AnnData object with n_obs × n_vars = 8000 × 21478
    obs: 'domain', 'protocol', 'dataset', 'cell_type', 'batch'
    var: 'chrom', 'chromStart', 'chromEnd', 'name', 'score', 'strand', 'thickStart', 'thickEnd', 'itemRgb', 'blockCount', 'blockSizes', 'blockStarts', 'gene_id', 'gene_type', 'mgi_id', 'havana_gene', 'tag', 'genome', 'n_counts', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm'
    uns: 'hvg'


In [13]:
sc.pp.highly_variable_genes(
        rna_small,
        flavor="seurat_v3",
        n_top_genes=8000,
        subset=True,
    )
sc.pp.highly_variable_genes(
        atac_small,
        flavor="seurat_v3",
        n_top_genes=80000,
        subset=True,
        span=0.9
    )
print(atac_small,rna_small)
atac_small.write_h5ad(os.path.abspath(os.path.join(os.getcwd(),'../download/Ma-2020/Ma-2020-sampled-small-ATAC.h5ad')))
rna_small.write_h5ad(os.path.abspath(os.path.join(os.getcwd(),'../download/Ma-2020/Ma-2020-sampled-small-RNA.h5ad')))


  adata.uns["hvg"] = {"flavor": flavor}
  adata.uns["hvg"] = {"flavor": flavor}


AnnData object with n_obs × n_vars = 10000 × 80000
    obs: 'domain', 'protocol', 'dataset', 'cell_type', 'batch'
    var: 'chrom', 'chromStart', 'chromEnd', 'genome', 'n_counts', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm'
    uns: 'hvg' AnnData object with n_obs × n_vars = 8000 × 8000
    obs: 'domain', 'protocol', 'dataset', 'cell_type', 'batch'
    var: 'chrom', 'chromStart', 'chromEnd', 'name', 'score', 'strand', 'thickStart', 'thickEnd', 'itemRgb', 'blockCount', 'blockSizes', 'blockStarts', 'gene_id', 'gene_type', 'mgi_id', 'havana_gene', 'tag', 'genome', 'n_counts', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm'
    uns: 'hvg'
