# Prepare scATAC Testdata 

This notebook was used to pre-process and subset the scATAC-seq test data. The raw files were downloaded from the 10x Genomics datasets page. More information can be found in the metadata (web summary) HTML files [Metadata 10x PBMC v2](10k_pbmc_ATACv2_nextgem_Chromium_Controller_web_summary.html) and [Metadata 10x PBMC v1.1](10k_pbmc_ATACv1p1_nextgem_Chromium_X_web_summary.html), which are provided within this directory.

Briefly, both datasets are merged at the matrix, fragment and BAM file level, and the user defines the regions (chromosomes) and the number of cells to keep. At the matrix level, the peak/cell matrix provided by 10x is converted to fixed-size genomic bins, because both datasets come with their own peak sets and bins give a shared feature space. The notebook then subsets the matrix, the fragments and the BAM file to the selected cells and regions.

## Load dependencies

In [None]:
import scanpy as sc
import pandas as pd
import os
import scipy as scp
import numpy as np
import glob
import pysam
import anndata as ad

import pyranges as pr
from scipy.sparse import coo_matrix, csr_matrix

import sctoolbox
import sctoolbox.utils as utils
import sctoolbox.tools as tools

## General Settings

In [None]:
# Root directory of the downloaded data; it is handed to from_mtx when the AnnData object is assembled.
path_mtx = "/mnt/workspace2/jdetlef/experimental/inspect_testdata/10k_PBMC-selection/"

# Shared pandas.read_csv settings: the BED and fragment files are read without a header line and tab-separated.
header = None
barcode_index = 0  # NOTE(review): not referenced in the visible part of this notebook - confirm it is still needed
genes_index = 0  # passed as `variables_index` to from_mtx
delimiter = "\t"

## Make unique index from bedfile

In [None]:
def peaks_to_bins(adata, chromsizes, bin_size=5000):
    """
    Re-aggregate a cell x peak count matrix into fixed-width genomic bins.

    Every chromosome listed in `chromsizes` is tiled with consecutive bins of
    `bin_size` bp (the last bin of a chromosome is truncated at the chromosome
    end). The counts of each peak are added to every bin the peak overlaps, so
    a peak that spans a bin border contributes to both bins.

    Parameters
    ----------
    adata : anndata.AnnData
        Object with cells in `.obs` and peaks in `.var`. `.var` must provide
        the columns 'Chromosome', 'Start' and 'End'. The object is not modified.
    chromsizes : str
        Path to a tab-separated file with a header line that contains at least
        the columns 'id' (chromosome name) and 'length'.
    bin_size : int, default 5000
        Width of the bins in bp. Must be positive.

    Returns
    -------
    anndata.AnnData
        New object with a copy of `adata.obs`, one `.var` row per bin
        (index 'chrom:start-end', columns 'Chromosome', 'Start', 'End') and the
        per-bin counts in `.X`.

    Raises
    ------
    ValueError
        If `bin_size` is not positive.
    """
    if bin_size <= 0:
        raise ValueError(f"bin_size must be a positive integer, got {bin_size}")

    # 1) Peak coordinates with an explicit positional index.
    #    A copy is used so that the caller's adata.var does not gain a 'peak_idx' column.
    peaks_df = adata.var[["Chromosome", "Start", "End"]].copy()
    peaks_df["peak_idx"] = np.arange(peaks_df.shape[0])

    # 2) Tile every chromosome with bins
    chrom_lengths = pd.read_csv(chromsizes, sep='\t').set_index('id')['length'].to_dict()

    bins = []
    for chrom, length in chrom_lengths.items():
        bins.extend(
            (chrom, start, min(start + bin_size, length))
            for start in range(0, length, bin_size)
        )
    bins_df = pd.DataFrame(bins, columns=["Chromosome", "Start", "End"])
    bins_df["bin_idx"] = np.arange(bins_df.shape[0])

    # 3) Join to get bin<->peak overlaps (the result carries bin_idx & peak_idx)
    overlap = pr.PyRanges(bins_df).join(pr.PyRanges(peaks_df)).df

    # 4) Build the sparse peak->bin indicator matrix M (n_peaks x n_bins)
    rows = overlap["peak_idx"].values
    cols = overlap["bin_idx"].values
    data = np.ones_like(rows, dtype=np.int8)
    M = coo_matrix((data, (rows, cols)),
                   shape=(adata.n_vars, bins_df.shape[0])).tocsr()

    # 5) Multiply to get cell x bin counts
    X_bins = adata.X.dot(M)

    # 6) Wrap into a new AnnData whose var index is 'chrom:start-end'
    var_bins = bins_df.set_index("bin_idx")[["Chromosome", "Start", "End"]]
    var_bins.index = [
        f"{c}:{s}-{e}"
        for c, s, e in zip(var_bins["Chromosome"], var_bins["Start"], var_bins["End"])
    ]

    return ad.AnnData(X=X_bins, obs=adata.obs.copy(), var=var_bins)


def make_unique_index(path, header=None, delimiter="\t"):
    """
    Write a 'peaks.tsv' with unique 'chrom:start-end' row names next to every 'peaks.bed'.

    Each direct sub-directory of `path` that contains a 'peaks.bed' gets a
    'peaks.tsv' in which the first column is the generated row name, followed
    by all original BED columns (tab-separated, no header line).
    Sub-directories without a 'peaks.bed' are skipped.

    Parameters
    ----------
    path : str
        Directory whose direct sub-directories are searched.
    header : int or None, default None
        Forwarded to `pandas.read_csv`; None means the BED file has no header line.
        The default equals the notebook-level `header` setting.
    delimiter : str, default "\\t"
        Column delimiter of the BED file. The default equals the notebook-level
        `delimiter` setting.

    Raises
    ------
    FileNotFoundError
        If `path` is not an existing directory.
    """
    try:
        subdirs = next(os.walk(path))[1]
    except StopIteration:
        raise FileNotFoundError(f"Not an existing directory: {path}") from None

    for directory in subdirs:
        subdir = os.path.join(path, directory)
        peaks_bed = os.path.join(subdir, 'peaks.bed')
        if not os.path.isfile(peaks_bed):
            # e.g. directories that only hold BAM or fragment files
            continue
        output = os.path.join(subdir, 'peaks.tsv')
        peaks_from_bed = pd.read_csv(peaks_bed, header=header, delimiter=delimiter)

        # The first three columns are chromosome, start and end; address them by position
        # so that the column labels do not matter.
        chrom, start, end = (peaks_from_bed.iloc[:, i].astype(str) for i in range(3))
        custom_index = (chrom + ":" + start + "-" + end).to_numpy()

        peaks_from_bed.index = pd.Index(custom_index, name='index')
        peaks_from_bed.to_csv(output, header=False, sep='\t')
        
# make_unique_index(path_mtx)

## Input filenames adata

In [None]:
# Filename patterns that are handed to utils.assemblers.from_mtx.
# adjust in case of different naming schemes for any of the input files
# NOTE(review): the '*' wildcards are presumably glob-style and also match compressed files (e.g. '.gz') - verify against from_mtx

mtx = '*matrix.mtx*'  # pattern for the file that contains counts
barcodes = '*barcodes.tsv*'  # pattern for the file that contains barcode information
variables = '*peaks.tsv*'  # pattern for the optional file that contains variable information

In [None]:
# Assemble the AnnData object from the matrix, barcode and peak files found via path_mtx
from_mtx_options = {
    "mtx": mtx,
    "barcodes": barcodes,
    "variables": variables,
    "variables_index": genes_index,
    "header": None,
    "var_error": False,
}
adata = utils.assemblers.from_mtx(path_mtx, **from_mtx_options)
adata

In [None]:
# Give the coordinate columns of adata.var the names expected by peaks_to_bins / PyRanges
coordinate_column_names = {'1': 'Chromosome', '2': 'Start', '3': 'End'}
adata.var.rename(columns=coordinate_column_names, inplace=True)

In [None]:
# Replace the peak features by fixed-size bins so that both datasets share the same features
chromsizes_file = '/mnt/flatfiles/organisms/new_organism/homo_sapiens/109/homo_sapiens.109.chrom.sizes'
adata = peaks_to_bins(adata, chromsizes_file, bin_size=5000)

In [None]:
# Rename barcodes to rule out duplicates between samples:
# everything after the first '-' of a barcode is replaced by the batch label of the cell.
batch_labels = adata.obs['batch']
# The second obs column is addressed by position via .iloc; integer keys on a
# label-indexed Series (row[1][1]) are deprecated in pandas.
second_obs_column = adata.obs.iloc[:, 1]

batched_barcodes = [
    barcode.split('-')[0] + '-' + batch
    for barcode, batch in zip(adata.obs.index, batch_labels)
]

# mapping: value of the second obs column -> batch label (the first occurrence wins)
mapping = {}
for key, batch in zip(second_obs_column, batch_labels):
    mapping.setdefault(key, batch)

adata.obs['batched_barcodes'] = batched_barcodes
adata.obs.set_index('batched_barcodes', inplace=True)

In [None]:
# add batched barcodes to the bamfiles
def tag_batch(in_bam, out_bam, batch_id):
    """
    Copy a BAM file and replace the suffix of every 'CB' barcode tag by a batch id.

    For reads that carry a 'CB' tag, everything after the first '-' of the tag
    value is replaced by `batch_id` (e.g. 'ACGT-1' -> 'ACGT-<batch_id>').
    Reads without a 'CB' tag are written unchanged.

    Parameters
    ----------
    in_bam : str
        Path to the input BAM file. It is read sequentially until the end of
        the file, so no index is required.
    out_bam : str
        Path of the BAM file to write; the header is taken from `in_bam`.
    batch_id : int or str
        Identifier appended to the raw barcode.
    """
    # Context managers close both files even if reading or writing fails midway.
    with pysam.AlignmentFile(in_bam, "rb") as bam_in, \
            pysam.AlignmentFile(out_bam, "wb", template=bam_in) as bam_out:
        for read in bam_in.fetch(until_eof=True):
            if read.has_tag("CB"):
                raw_bc = read.get_tag("CB").split('-')[0]
                read.set_tag("CB", f"{raw_bc}-{batch_id}")  # pysam will infer the Z type
            bam_out.write(read)

## Apply to both samples
#bamfile_path = "/mnt/workspace2/jdetlef/experimental/inspect_testdata/10k_PBMC-selection/bamfiles"
#print(f'adding batch information to {bamfile_path}')
#tag_batch(os.path.join(bamfile_path, "10k_pbmc_ATACv1p1_nextgem_Chromium_X_possorted_bam.bam"), os.path.join(bamfile_path,"10k_pbmc_ATACv1p1_nextgem_Chromium_X_batched_bam.bam"), 1)
#print("10k_pbmc_ATACv1p1_nextgem_Chromium_X_possorted_bam.bam DONE")
#tag_batch(os.path.join(bamfile_path, "10k_pbmc_ATACv2_nextgem_Chromium_Controller_possorted_bam.bam"), os.path.join(bamfile_path, "10k_pbmc_ATACv2_nextgem_Chromium_Controller_batched_bam.bam"), 0)
#print("10k_pbmc_ATACv2_nextgem_Chromium_Controller_possorted_bam.bam DONE")

## NOW MERGE WITH SAMTOOLS --> COMMAND LINE

## Subset adata 

In [None]:
# Subset n_cells randomly selected
# Subset by chromosomes
n_cells = 80000
chroms_to_keep= ['chr1', 'chr2']
random_seed = 42  # fixed seed so that the same subset can be regenerated

# ,'chr10', 'chr11', 'chr12','chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr1', 'chr20', 'chr21', 'chr22', 'chrX', 'chrY'

# Draw the barcodes WITHOUT replacement so that every drawn barcode is unique.
# The request is capped at the number of available cells, i.e. all cells are
# kept when n_cells is larger than the dataset.
rng = np.random.default_rng(random_seed)
n_sampled = min(n_cells, adata.n_obs)
barcode_sample = rng.choice(adata.obs.index.to_numpy(), size=n_sampled, replace=False)

obs_mask = adata.obs.index.isin(barcode_sample)
var_mask = adata.var['Chromosome'].isin(chroms_to_keep)

subdata = adata[obs_mask, var_mask]

# Two filter passes: removing features can push cells below the threshold and vice versa.
for _ in range(2):
    # keep cells with more than 100 non-zero features
    # (asarray + ravel turns the (n, 1) matrix returned for sparse X into a flat boolean mask)
    cell_mask = np.asarray((subdata.X > 0).sum(axis=1)).ravel() > 100
    subdata = subdata[cell_mask]
    # keep features with a total count of more than 20
    feature_mask = np.asarray(subdata.X.sum(axis=0)).ravel() > 20
    subdata = subdata[:, feature_mask]

subdata

In [None]:
# Inspect the per-cell table of the subset (its index holds the batched barcodes)
subdata.obs

In [None]:
# Inspect the per-feature table of the subset
subdata.var

In [None]:
# Mean number of features with a non-zero count per cell (row) of the subset
np.mean(np.sum(subdata.X > 0, axis=1))

## Save Barcodes Subset

In [None]:
# Collect the barcodes (obs index) of the subsetted adata and store them on disk
barcodes = subdata.obs.index.tolist()

df = pd.DataFrame({"barcode": barcodes})
# One barcode per line: neither the row numbers nor the column name are written.
barcode_file = "scf_testdata/10k_PBMCs_merged_barcodes.tsv"
df.to_csv(barcode_file, sep="\t", index=False, header=False)

## Save Peaks Subset

In [None]:
# Save Peaks / Bins
# Only the columns of subdata.var are written: the row names (index) and the column names are left out.
peaks_file = "scf_testdata/10k_PBMCs_merged_peaks.tsv"
subdata.var.to_csv(peaks_file, sep="\t", index=False, header=False)

## Save Subset Fragments

In [None]:
# Load the fragment files of both datasets
fragments_path = "/mnt/workspace2/jdetlef/experimental/inspect_testdata/10k_PBMC-selection/fragments"
PBMC_v1 = os.path.join(fragments_path, '10k_pbmc_ATACv1p1_nextgem_Chromium_X_fragments.tsv.gz')
PBMC_v2 = os.path.join(fragments_path, '10k_pbmc_ATACv2_nextgem_Chromium_Controller_fragments.tsv.gz')

# Same reader settings for both files; lines starting with '#' are ignored.
fragment_reader_options = {"header": header, "delimiter": delimiter, "comment": '#'}
PBMC_v1_fragments = pd.read_csv(PBMC_v1, **fragment_reader_options)
PBMC_v2_fragments = pd.read_csv(PBMC_v2, **fragment_reader_options)

In [None]:
# Show the mapping collected while renaming the barcodes (value of the second obs column -> batch label);
# NOTE(review): presumably used to pick matching batch ids for the fragment files - confirm
mapping

In [None]:
# Add Batch information 
def batch_fragments(fragments, batch):
    """
    Replace the suffix of every fragment barcode by a batch id.

    The barcode column (column label 3) is rewritten in place: everything after
    the first '-' is replaced by `batch`, e.g. 'ACGT-1' -> 'ACGT-<batch>'.
    Barcodes without a '-' just get '-<batch>' appended.

    Parameters
    ----------
    fragments : pandas.DataFrame
        Fragment table read without a header, i.e. with integer column labels;
        column 3 holds the barcodes as strings.
    batch : int or str
        Batch identifier to append.

    Returns
    -------
    pandas.DataFrame
        The same (modified) object that was passed in.
    """
    if fragments.empty:
        return fragments

    # Vectorised string handling instead of a Python-level loop over every row
    raw_barcodes = fragments[3].str.split('-', n=1).str[0]
    fragments[3] = raw_barcodes + '-' + str(batch)

    return fragments

# Batch ids: 1 for the v1.1 fragments, 0 for the v2 fragments
print('processing')
PBMC_v1_fragments = batch_fragments(fragments=PBMC_v1_fragments, batch=1)
print('DONE:1')
PBMC_v2_fragments = batch_fragments(fragments=PBMC_v2_fragments, batch=0)
print('DONE:2')

In [None]:
# Stack the fragments of both datasets (fresh running row index) and store the combined table
fragments = pd.concat((PBMC_v1_fragments, PBMC_v2_fragments), ignore_index=True)
combined_fragments_file = os.path.join(fragments_path, "combined_fragments.tsv")
fragments.to_csv(combined_fragments_file, sep="\t", index=False, header=False)

#fragments = pd.read_csv(os.path.join(fragments_path, "combined_fragments.tsv"), sep="\t", header=None)
#fragments

In [None]:
chroms_to_keep = ['chr1']
# Keep only fragments of the selected barcodes that lie on the selected chromosomes
selected = fragments[3].isin(barcodes) & fragments[0].isin(chroms_to_keep)
sub_fragments = fragments[selected].reset_index()
# reset_index stored the old row labels in an 'index' column; it is not needed
del sub_fragments['index']
sub_fragments

In [None]:
# Limit the number of fragments per barcode
n_fragments_bc = 200
fragment_seed = 42  # fixed seed so that the same fragments are drawn on every run

rng = np.random.default_rng(fragment_seed)

sampling = []
for bc, grp in sub_fragments.groupby(3):
    # Draw without replacement so that no fragment is duplicated; barcodes with
    # fewer than n_fragments_bc fragments keep all of their fragments.
    n_draw = min(len(grp), n_fragments_bc)
    picked = np.sort(rng.choice(len(grp), size=n_draw, replace=False))  # sorted: keeps the input order within a barcode
    sampling.append(grp.iloc[picked])

# pd.concat raises on an empty list, so an empty table is passed through unchanged
if sampling:
    sub_fragments = pd.concat(sampling)

In [None]:
# Save fragments
# Plain tab-separated rows: the row index and the column labels are not written.
subdata_fragments_file = os.path.join(fragments_path, "subdata_fragments.tsv")
sub_fragments.to_csv(subdata_fragments_file, sep="\t", index=False, header=False)

In [None]:
# Show the summary of the subsetted AnnData object
subdata

## Save Subset BAM

In [None]:
# Input BAM for the subsetting below.
# NOTE(review): presumably the samtools-merged, sorted and indexed BAM of both batch-tagged samples - confirm
bamfile = '/mnt/workspace2/jdetlef/experimental/inspect_testdata/10k_PBMC-selection/bamfiles/10k_pbmc_sorted.bam'
# Output BAM that receives the reads of the selected barcodes
subdata_bam = '/mnt/workspace2/jdetlef/experimental/inspect_testdata/scf_testdata/10k_PBMCs_merged_bam.bam'

In [None]:
bam_in = bamfile
bam_out = subdata_bam
read_tag = "CB"
pysam_threads = 4  # was `4,` (a one-element tuple by accident); not used by the code below
overwrite = False  # not used by the code below

chroms_to_keep= ['chr1']

# Subset merged bam from above based on the subsetted adata:
# only reads on chroms_to_keep whose barcode tag is among the selected barcodes are written.

# check then load modules
utils.checker.check_module("tqdm")
if utils.jupyter._is_notebook() is True:
    from tqdm import tqdm_notebook as tqdm
else:
    from tqdm import tqdm
utils.checker.check_module("pysam")

# Create output dir if needed
utils.io.create_dir(bam_out)

# A set gives constant-time barcode lookups in the read loop
barcodes = set(barcodes)

# Open files
bam_in_obj = tools.bam.open_bam(bam_in, mode="rb", verbosity=0)
bam_out_obj = tools.bam.open_bam(bam_out, mode="wb",verbosity=0, template=bam_in_obj)

pbar_reading = None
pbar_writing = None
try:
    # Update progress based on total number of reads
    # NOTE(review): total presumably refers to the whole file while only chroms_to_keep are read,
    # so the bars may stop before 100% - confirm against tools.bam.get_bam_reads
    total = tools.bam.get_bam_reads(bam_in_obj)
    print(' ', end='', flush=True)  # hack for making progress bars work in notebooks; https://github.com/tqdm/tqdm/issues/485#issuecomment-473338308
    pbar_reading = tqdm(total=total, desc="Reading... ", unit="reads")
    pbar_writing = tqdm(total=total, desc="% written from input", unit="reads")
    step = max(1, int(total / 10000))  # 10000 total updates; at least 1 so small files are reported too

    # Iterate over reads
    writing_i = 0
    reading_i = 0
    written = 0

    for chrom in chroms_to_keep:
        # fetch() by chromosome needs an indexed BAM file
        for read in bam_in_obj.fetch(chrom):
            if read.has_tag(read_tag) and read.get_tag(read_tag) in barcodes:
                bam_out_obj.write(read)
                written += 1
                writing_i += 1
                if writing_i == step:
                    pbar_writing.update(step)
                    pbar_writing.refresh()
                    writing_i = 0

            reading_i += 1

            # Update step manually - there is an overhead to update per read with hundreds of million reads
            if reading_i == step:
                pbar_reading.update(step)
                pbar_reading.refresh()
                reading_i = 0

    # Report the reads that did not fill a complete step
    pbar_reading.update(reading_i)
    pbar_writing.update(writing_i)

finally:
    # close progressbars
    if pbar_reading is not None:
        pbar_reading.close()
    if pbar_writing is not None:
        pbar_writing.close()

    # Close bamfiles - also when the loop above fails, so that no handle stays open
    bam_in_obj.close()
    bam_out_obj.close()
#logger.info(f"Wrote {written} reads to output bam")

In [None]:
# Index Bam before processing (fetching by chromosome below requires a sorted, indexed BAM)
subdata_bam = '/mnt/workspace2/jdetlef/experimental/inspect_testdata/scf_testdata/10k_PBMCs_merged_sorted.bam'
read_tag = "CB"
pysam_threads = 4  # was `4,` (a one-element tuple by accident); not used by the code below
overwrite = False  # not used by the code below
n_reads = 250  # maximum number of reads written per barcode

chroms_to_keep= ['chr1']

# Subset bam to limit reads per barcode
bam_in = subdata_bam
directory = os.path.split(bam_in)[0]
bam_out = os.path.join(directory, '10k_PBMCs_sampled_bam.bam')

# Open files
bam_in_obj = tools.bam.open_bam(bam_in, mode="rb", verbosity=0)
bam_out_obj = tools.bam.open_bam(bam_out, mode="wb",verbosity=0, template=bam_in_obj)

barcodes = set(barcodes)

pbar_reading = None
pbar_writing = None
try:
    # Update progress based on total number of reads
    # (tqdm is the name imported in the BAM subsetting cell)
    total = tools.bam.get_bam_reads(bam_in_obj)
    print(' ', end='', flush=True)  # hack for making progress bars work in notebooks; https://github.com/tqdm/tqdm/issues/485#issuecomment-473338308
    pbar_reading = tqdm(total=total, desc="Reading... ", unit="reads")
    pbar_writing = tqdm(total=total, desc="% written from input", unit="reads")
    step = max(1, int(total / 10000))  # 10000 total updates; at least 1 so small files are reported too

    # Iterate over reads
    writing_i = 0
    reading_i = 0
    written = 0

    read_dict = {}  # barcode -> number of reads seen so far
    # Iterate over chroms_to_keep explicitly instead of relying on a `chrom`
    # variable left over from a previously executed cell.
    for chrom in chroms_to_keep:
        for read in bam_in_obj.fetch(chrom):
            # Reads without a barcode tag cannot be assigned to a cell and are skipped
            if read.has_tag(read_tag):
                read_bc = read.get_tag(read_tag)
                read_dict[read_bc] = read_dict.get(read_bc, 0) + 1
                # The first n_reads reads of every barcode (in file order) are kept
                if read_dict[read_bc] <= n_reads:
                    bam_out_obj.write(read)
                    written += 1
                    writing_i += 1
                    if writing_i == step:
                        pbar_writing.update(step)
                        pbar_writing.refresh()
                        writing_i = 0

            reading_i += 1

            # Update step manually - there is an overhead to update per read with hundreds of million reads
            if reading_i == step:
                pbar_reading.update(step)
                pbar_reading.refresh()
                reading_i = 0

    # Report the reads that did not fill a complete step
    pbar_reading.update(reading_i)
    pbar_writing.update(writing_i)

finally:
    # close progressbars
    if pbar_reading is not None:
        pbar_reading.close()
    if pbar_writing is not None:
        pbar_writing.close()

    # Close bamfiles - also when the loop above fails, so that no handle stays open
    bam_in_obj.close()
    bam_out_obj.close()
#logger.info(f"Wrote {written} reads to output bam")

In [None]:
# Inspect the per-feature table once more before saving
subdata.var

## Save subdata

In [None]:
# The per-cell 'batch' column is stored under the name 'sample' in the saved object
obs_column_names = {'batch': 'sample'}
subdata.obs.rename(columns=obs_column_names, inplace=True)

In [None]:
# Collect the suffix after the first '-' of every barcode in the subset
subdata_barcodes = subdata.obs.index.tolist()
samples = [cell_barcode.split('-')[1] for cell_barcode in subdata_barcodes]

In [None]:
# Show the barcode suffixes collected above
samples

In [None]:
# Write the subsetted AnnData object to an .h5ad file
subdata.write("scf_testdata/10k_PBMCs_merged_adata.h5ad")