## Find scWGS L1 breakpoints that intersect SLAVseq peaks

In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import pyranges as pr
from gzip import open as gopen
from itertools import filterfalse, product
from collections import defaultdict
from scripts.pyslavseq.igv import make_igv_batch_script, IGV

In [3]:
# define functions to read the breakpoints
def bp_filter(line):
    """Return True when a breakpoint line should be discarded.

    Written as a predicate for ``itertools.filterfalse``: a line is kept
    (``False`` is returned) only when both read columns -- the 11th and 12th
    tab-separated fields -- mention ``L1HS``.

    Lines with fewer than 12 fields (headers, blank lines, records whose
    trailing read column is empty and therefore stripped away) cannot satisfy
    that condition, so they are reported as "discard" instead of raising
    ``IndexError``.
    """
    fields = line.strip().split("\t")
    if len(fields) < 12:
        # Not enough columns to carry L1HS reads on both sides.
        return True
    return not ("L1HS" in fields[11] and "L1HS" in fields[10])


def read_breakpoints(fn: str):
    """Parse a gzipped, tab-separated breakpoint file into a PyRanges.

    Only lines for which ``bp_filter`` returns False are read. For each one
    the function records the interval (field 1 minus one as ``Start``,
    field 2 as ``End``), the start/end support counts (the integer after the
    ``:`` in fields 3 and 4) and, separately for the start and end side
    (fields 10 and 11), the ``;``-separated read entries mentioning ``L1HS``
    together with the mean of their L1 start/end coordinates, the midpoint of
    those two means and the set of strands seen. ``l1_size`` is the absolute
    distance between the start-side and end-side midpoints.

    Rows are kept only when both support counts exceed 1.
    """
    table = defaultdict(list)
    with gopen(fn, "rt") as handle:
        for record in filterfalse(bp_filter, handle):
            fields = record.strip().split("\t")
            table["Chromosome"].append(fields[0])
            # presumably converts a 1-based start to 0-based -- TODO confirm
            table["Start"].append(int(fields[1]) - 1)
            table["End"].append(int(fields[2]))
            table["start_support"].append(int(fields[3].split(":")[1]))
            table["end_support"].append(int(fields[4].split(":")[1]))

            # summarise the L1HS reads on each side of the breakpoint
            for side, read_column in (("start", fields[10]), ("end", fields[11])):
                l1_reads = [entry for entry in read_column.split(";") if "L1HS" in entry]
                l1_starts, l1_ends, strands = [], [], set()
                for entry in l1_reads:
                    parts = entry.split(",")
                    l1_starts.append(int(parts[1]))
                    l1_ends.append(int(parts[2]))
                    strands.add(parts[3])
                mean_start = np.mean(l1_starts)
                mean_end = np.mean(l1_ends)
                table[f"{side}_reads"].append(l1_reads)
                table[f"{side}_l1start"].append(mean_start)
                table[f"{side}_l1end"].append(mean_end)
                table[f"{side}_l1"].append(np.mean([mean_start, mean_end]))
                table[f"{side}_l1strand"].append(strands)

            l1_span = table["start_l1"][-1] - table["end_l1"][-1]
            table["l1_size"].append(abs(l1_span))

    supported = pd.DataFrame(table).query("start_support > 1 and end_support > 1")
    return pr.PyRanges(supported)

In [2]:
# Load the region table from the first predictions file and attach one
# "<folder>_proba" column per predictions file (folder = parent directory name).
proba = {}  # NOTE(review): not used in the cells visible here
columns = [
    "Chromosome",
    "Start",
    "End",
    "n_reads",
    "n_unique_3end",
    "n_unique_5end",
    "n_duplicates",
    "label",
    "n_cells",
    "germline_dist",
]
# "cell_id" is deliberately read in addition to `columns`: the per-cell peak
# selection filters on regions["cell_id"], which raises KeyError if the column
# has been dropped here. `columns` itself is left unchanged so downstream
# selections with it keep their shape.
regions = pd.read_parquet(snakemake.input.predictions[0])[["cell_id", *columns]]
for p in snakemake.input.predictions:
    fs = Path(p).parent.name
    # assumes every predictions file shares the same row index -- TODO confirm
    regions[f"{fs}_proba"] = pd.read_parquet(p)["test_proba"]
regions

In [None]:
# For every single cell: keep the WGS breakpoints that are absent from bulk
# (bulk intervals padded by 500) and that overlap one of that cell's peaks.
peaks, breakpoints = {}, {}
cells = {
    "A8": "ush1_A8_S178",
    "B3": "ush1_B3_S140",
    "D6": "ush1_D6_S165",
    "bulk": "bulk",
}
# `sample_id` instead of `id`, so the builtin id() is not shadowed for the
# rest of the notebook session.
# assumes snakemake.input.breakpoints is ordered like `cells` -- TODO confirm
for i, (cell, sample_id) in enumerate(cells.items()):
    peaks[cell] = pr.PyRanges(regions.loc[regions["cell_id"] == sample_id][columns])
    breakpoints[cell] = snakemake.input.breakpoints[i]  # type: ignore

print("Reading breakpoints...")
bpts = {k: read_breakpoints(v) for k, v in breakpoints.items()}
bpts["bulk"] = bpts["bulk"].extend(500)
bpts_res = {}
for cell in ["A8", "B3", "D6"]:
    # invert=True keeps the cell breakpoints that do NOT overlap bulk
    bpts_res[cell] = bpts[cell].overlap(bpts["bulk"], invert=True)
    bpts_res[cell] = bpts_res[cell].overlap(peaks[cell]).df

# stack the per-cell frames, turning the dict key into a "cell_id" column
bpts_res = (
    pd.concat(bpts_res).reset_index(level=0, names="cell_id").reset_index(drop=True)
)
bpts_res["l1_size"].hist(bins=100)

In [5]:
# Keep breakpoints whose l1_size exceeds 1000, then rebuild one PyRanges per cell.
bpts_res = bpts_res.loc[bpts_res["l1_size"] > 1000]
bpts_res = {
    cell_id: pr.PyRanges(frame) for cell_id, frame in bpts_res.groupby("cell_id")
}
bpts_res

{'A8': +------------+--------------+-----------+-----------+-------+
 | cell_id    | Chromosome   | Start     | End       | +13   |
 | (object)   | (category)   | (int64)   | (int64)   | ...   |
 |------------+--------------+-----------+-----------+-------|
 | A8         | chr2         | 36112239  | 36112276  | ...   |
 | A8         | chr3         | 104710432 | 104710603 | ...   |
 | A8         | chr3         | 199121196 | 199121354 | ...   |
 | A8         | chr4         | 125962185 | 125962206 | ...   |
 | ...        | ...          | ...       | ...       | ...   |
 | A8         | chr4         | 125962522 | 125962554 | ...   |
 | A8         | chr4         | 125963623 | 125963652 | ...   |
 | A8         | chr2         | 36112239  | 36112276  | ...   |
 | A8         | chr16        | 25145516  | 25145654  | ...   |
 | A8         | chr16        | 25145748  | 25145782  | ...   |
 | A8         | chr16        | 85429487  | 85429521  | ...   |
 | A8         | chr16        | 85429646  | 854296

In [6]:
# Per cell, keep the peaks that overlap a retained breakpoint, stack them
# (the cell name becomes the outer index level) and apply the read-count and
# germline-distance thresholds.
peaks_res = pd.concat(
    {
        cell: peaks[cell].overlap(bpts_res[cell]).df
        for cell in ("A8", "B3", "D6")
    }
)
peaks_res = peaks_res.query("n_reads >= 10 and germline_dist > 10000")
peaks_res

Unnamed: 0,Unnamed: 1,Chromosome,Start,End,n_reads,n_unique_3end,n_unique_5end,n_duplicates,label,n_cells,germline_dist,test_proba
A8,0,chr2,36111824,36112332,26,26,26,10,OTHER,1,236659,0.224742
A8,3,chr4,125962145,125962802,48,29,42,66,OTHER,207,5114,0.003058
A8,4,chr4,125963252,125964012,214,41,150,361,OTHER,207,6221,0.028273
A8,9,chr7,4814838,4815237,24,11,24,4,OTHER,1,2774806,0.1784
A8,12,chr10,104549908,104550376,22,12,21,3,OTHER,1,1716547,0.402438
B3,0,chr2,2019081,2020228,45,39,44,7,OTHER,1,249013,0.142106
B3,2,chr2,99316843,99317413,36,29,32,17,OTHER,2,1589206,0.264991
B3,6,chr2,147733850,147734476,22,14,21,12,OTHER,1,910455,0.198649
B3,9,chr2,205255674,205256325,53,29,53,12,OTHER,64,6156,0.000258
B3,12,chr3,20715649,20716208,287,107,182,61,KNRGL,335,1370951,0.946617


In [7]:
# IGV configuration: the reference genome plus every track to load, listed as
# (label, url) pairs in display order.
track_urls = (
    (
        "Bulk_SLAVseq",
        "https://brainome.ucsd.edu/mcuoco/for_igv/slavseq/gDNA_usd1.tagged.sorted.bam",
    ),
    (
        "Bulk_WGS",
        "https://brainome.ucsd.edu/mcuoco/for_igv/bulk_30x_wgs_calls/1/LIBD73.md.bam",
    ),
    (
        "A8_SLAVseq",
        "https://brainome.ucsd.edu/mcuoco/for_igv/slavseq/ush1_A8_S178.tagged.sorted.bam",
    ),
    (
        "A8_WGS",
        "https://brainome.ucsd.edu/mcuoco/for_igv/single_cell_30x_wgs/A8.md.bam",
    ),
    (
        "A8_WGS_te_reads",
        "https://brainome.ucsd.edu/mcuoco/for_igv/single_cell_30x_wgs/A8.bp_reads.bam",
    ),
    (
        "B3_SLAVseq",
        "https://brainome.ucsd.edu/mcuoco/for_igv/slavseq/ush1_B3_S140.tagged.sorted.bam",
    ),
    (
        "B3_WGS",
        "https://brainome.ucsd.edu/mcuoco/for_igv/single_cell_30x_wgs/B3.md.bam",
    ),
    (
        "B3_WGS_te_reads",
        "https://brainome.ucsd.edu/mcuoco/for_igv/single_cell_30x_wgs/B3.bp_reads.bam",
    ),
    (
        "D6_SLAVseq",
        "https://brainome.ucsd.edu/mcuoco/for_igv/slavseq/ush1_D6_S165.tagged.sorted.bam",
    ),
    (
        "D6_WGS",
        "https://brainome.ucsd.edu/mcuoco/for_igv/single_cell_30x_wgs/D6.md.bam",
    ),
    (
        "D6_WGS_te_reads",
        "https://brainome.ucsd.edu/mcuoco/for_igv/single_cell_30x_wgs/D6.bp_reads.bam",
    ),
    (
        "RMSK",
        "https://brainome.ucsd.edu/mcuoco/for_igv/chm13v2.0.XY.fasta.all_rmsk.bed",
    ),
    (
        "SLAVseq_primers",
        "https://brainome.ucsd.edu/mcuoco/for_igv/L1_primers.bed",
    ),
    (
        "MEGANE_WGS_calls",
        "https://brainome.ucsd.edu/mcuoco/for_igv/bulk_30x_wgs_calls/1/megane_gaussian.bed",
    ),
)
config = {
    "genome": {
        "url": "https://brainome.ucsd.edu/mcuoco/for_igv/chm13v2.0.XY.fasta",
    },
    "tracks": [{"label": label, "url": url} for label, url in track_urls],
}

In [8]:
# Merge the retained peaks into combined intervals and return to a DataFrame.
merged_peaks = pr.PyRanges(peaks_res).merge()
peaks_res = merged_peaks.df
peaks_res

Unnamed: 0,Chromosome,Start,End
0,chr1,186708907,186709680
1,chr2,2019081,2020228
2,chr2,36111824,36112332
3,chr2,71428292,71428812
4,chr2,71429067,71429648
5,chr2,71430554,71431128
6,chr2,99316843,99317413
7,chr2,147733850,147734476
8,chr2,205255674,205256325
9,chr3,20715649,20716208


In [9]:
# Output directory: the name of the folder that contains the first Snakemake
# output, created if it does not exist yet.
# NOTE(review): `.parent.name` keeps only the last path component, so the
# directory is resolved relative to the current working directory -- confirm
# this is intended rather than the full parent path.
outdir = Path(snakemake.output[0]).parent.name
Path(outdir).mkdir(exist_ok=True, parents=True)

In [10]:
# Build the IGV batch script for the merged peak regions.
script = make_igv_batch_script(
    config,
    regions=peaks_res,
    outdir=outdir,
)