### Example code for generating subsampled BAMs for benchmarking analysis

In [1]:
import pandas as pd

In [2]:
# Location of the supplementary tables workbook, relative to this notebook.
supp_tables = "../data/Shukla.Levine.Gundem Supplementary Tables.xlsx"

# sheet_name=7 picks the sheet at zero-based position 7 (the eighth sheet);
# header=1 takes the column labels from the second row of that sheet.
sup8 = pd.read_excel(
    io=supp_tables,
    sheet_name=7,
    header=1,
)

In [3]:
# Keep only the sample identifier and its original sequencing coverage.
samples = sup8.loc[:, ["Individual ID", "Original Coverage"]]

### Calculate the subsampling fraction for every feasible target coverage

In [4]:
# Candidate target coverages to subsample each BAM down to.
TARGET_COVERAGES = [100, 80, 60, 30]
# A target is only used when the original coverage exceeds it by more than
# this margin.
MIN_COVERAGE_MARGIN = 5

# Build one record per (sample, feasible target coverage) pair and construct
# the DataFrame once. Growing a frame row-by-row with DataFrame.append is
# quadratic, and the method was removed in pandas 2.0.
records = [
    {
        "ID": sample_id,
        "Full Coverage": coverage,
        "Target Coverage": tc,
        # Fraction of reads to keep so that coverage drops to the target.
        "Fraction": tc / coverage,
    }
    for sample_id, coverage in zip(
        samples["Individual ID"], samples["Original Coverage"]
    )
    for tc in TARGET_COVERAGES
    if coverage > tc + MIN_COVERAGE_MARGIN
]

# Passing `columns` keeps the expected schema even when no record qualifies.
to_subsample = pd.DataFrame(
    records,
    columns=["ID", "Full Coverage", "Target Coverage", "Fraction"],
)

In [5]:
# Preview the first five rows of the subsampling plan.
to_subsample.head(n=5)

Unnamed: 0,ID,Full Coverage,Target Coverage,Fraction
0,H133673,98,80,0.816327
1,H133673,98,60,0.612245
2,H133673,98,30,0.306122
3,H134753,105,80,0.761905
4,H134753,105,60,0.571429


### The code below demonstrates how BAMs were subsampled to various coverage levels. This was done on an HPC cluster using parallel jobs consisting of 4 tasks.

In [6]:
# samtools `-s` takes SEED.FRACTION: the integer part is the seed and the
# digits after the decimal point are the fraction of reads to keep.
seed = '3423423'

plan_columns = zip(
    to_subsample["ID"],
    to_subsample["Target Coverage"],
    to_subsample["Fraction"],
)
for sample_id, target_cov, keep_fraction in plan_columns:
    # Keep only the digits after the decimal point of the rounded fraction
    # (e.g. 0.8163 -> "8163") so they can be appended to the seed.
    rounded = round(keep_fraction, 4)
    fraction = str(rounded).split(".")[1]

    # Placeholder file names: replace in_bam with the real BAM path.
    in_bam = f"{sample_id}.bam"
    out_bam = f"{sample_id}_{target_cov}X.bam"

    # The commands are only printed here, not executed.
    print(f"samtools view -h -s {seed}.{fraction} -@4 {in_bam} -b > {out_bam}")

samtools view -h -s 3423423.8163 -@4 H133673.bam -b > H133673_80X.bam
samtools view -h -s 3423423.6122 -@4 H133673.bam -b > H133673_60X.bam
samtools view -h -s 3423423.3061 -@4 H133673.bam -b > H133673_30X.bam
samtools view -h -s 3423423.7619 -@4 H134753.bam -b > H134753_80X.bam
samtools view -h -s 3423423.5714 -@4 H134753.bam -b > H134753_60X.bam
samtools view -h -s 3423423.2857 -@4 H134753.bam -b > H134753_30X.bam
samtools view -h -s 3423423.8955 -@4 H134754.bam -b > H134754_60X.bam
samtools view -h -s 3423423.4478 -@4 H134754.bam -b > H134754_30X.bam
samtools view -h -s 3423423.7407 -@4 H134755.bam -b > H134755_60X.bam
samtools view -h -s 3423423.3704 -@4 H134755.bam -b > H134755_30X.bam
samtools view -h -s 3423423.8791 -@4 H134756.bam -b > H134756_80X.bam
samtools view -h -s 3423423.6593 -@4 H134756.bam -b > H134756_60X.bam
samtools view -h -s 3423423.3297 -@4 H134756.bam -b > H134756_30X.bam
samtools view -h -s 3423423.8451 -@4 H134757.bam -b > H134757_60X.bam
samtools view -h -s 