In [None]:
import itertools as it
import operator
import re
from collections import Counter
from pathlib import Path

import gfapy
import holoviews as hv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.compute as pc
from pyarrow import csv
from tqdm.auto import tqdm, trange

In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and edits to project code take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import paulssonlab.sequencing.gaf as gaf

In [None]:
# Activate the Bokeh backend for HoloViews.
hv.extension("bokeh")

# Config

In [None]:
# Root directory of the sequencing run; every input below is resolved
# relative to it.
data_dir = Path(
    "/home/jqs1/scratch/jqs1/sequencing/230818_bcd_rbses/20230818_1343_1A_PAQ97606_f49ab41c"
)

# GAF file that the cells below parse.
gaf_filename = data_dir / "temp/mapped_t4.gaf"

# Reference graph (GFA), loaded with gfapy.
gfa_filename = data_dir / "references/bcd_rbses.gfa"
gfa = gfapy.Gfa.from_file(gfa_filename)

# Completeness

In [None]:
%%time
# Tally, over every record in the GAF file:
#   segments    - how many times each segment name occurs in a path
#   ends        - how many times each segment name is the first or last step
#   total_reads - number of records seen
segments = Counter()
ends = Counter()
total_reads = 0
for table in tqdm(gaf.parse_gaf(gaf_filename)):
    for raw_path in table.column("path").to_pylist():
        # Drop the first character of each step (presumably the GAF
        # orientation marker -- confirm against gaf.parse_gaf).
        path = [step[1:] for step in raw_path]
        segments.update(path)
        # A one-step path has identical first and last entries, so that
        # segment is counted twice here.
        ends.update((path[0], path[-1]))
        total_reads += 1

In [None]:
# Print each segment's occurrence count as a whole-number percentage of
# total_reads, ordered by segment name (plain string sort).
for segment in sorted(segments):
    percent = f"{segments[segment]/total_reads*100:.0f}"
    print(f"{segment}: {percent}%")

In [None]:
# Print how often each segment is a path end (first or last step), as a
# whole-number percentage of total_reads, ordered by segment name.
for segment in sorted(ends):
    percent = f"{ends[segment]/total_reads*100:.0f}"
    print(f"{segment}: {percent}%")

# Duplex barcode mismatches

In [None]:
%%time
# Build name_to_barcode: for every read whose path contains both a BIT0 and a
# BIT29 segment (in either state), store a 30-element tuple of booleans.
#   total_reads       - number of records seen
#   complete_barcodes - number of records that passed the BIT0/BIT29 check
total_reads = 0
complete_barcodes = 0
name_to_barcode = {}
# The parser is only reachable through the `gaf` module alias; a bare
# `parse_gaf` is never imported and raises NameError on a fresh kernel.
for table in tqdm(gaf.parse_gaf(gaf_filename)):
    names = table.column("name").to_pylist()
    raw_paths = table.column("path").to_pylist()
    for name, raw_path in zip(names, raw_paths):
        # Drop the first character of each step (presumably the GAF
        # orientation marker -- confirm against gaf.parse_gaf) and keep a set
        # for fast membership checks.
        path = {step[1:] for step in raw_path}
        total_reads += 1
        has_first_bit = "BIT0:0" in path or "BIT0:1" in path
        has_last_bit = "BIT29:0" in path or "BIT29:1" in path
        if has_first_bit and has_last_bit:
            complete_barcodes += 1
            # Bit n is True only if "BIT{n}:1" is in the path; a bit whose
            # segment is missing entirely reads as False, same as "BIT{n}:0".
            barcode = tuple(f"BIT{bit}:1" in path for bit in range(30))
            name_to_barcode[name] = barcode

In [None]:
# Show the raw counts together with the fraction of reads that passed the
# complete-barcode check.
complete_fraction = complete_barcodes / total_reads
(complete_barcodes, total_reads, complete_fraction)

In [None]:
# Sort every two-part read name ("<a>;<b>", presumably a duplex read built
# from reads a and b -- confirm) into one of three lists:
#   duplex_missing    - a or b has no entry in name_to_barcode
#   duplex_matches    - a and b have equal barcodes
#   duplex_mismatches - a and b have different barcodes
# The two-part read's own barcode is not consulted.
duplex_matches = []
duplex_mismatches = []
duplex_missing = []
for duplex_name in tqdm(name_to_barcode):
    parents = duplex_name.split(";")
    if len(parents) != 2:
        # Names without exactly one ";" are not classified.
        continue
    first, second = parents
    if first not in name_to_barcode or second not in name_to_barcode:
        duplex_missing.append(duplex_name)
    elif name_to_barcode[first] == name_to_barcode[second]:
        duplex_matches.append(duplex_name)
    else:
        duplex_mismatches.append(duplex_name)

In [None]:
# Spot check: do these two specific reads have the same barcode?
# Raises KeyError if either name is absent from name_to_barcode.
read_a = "e7a0f1dc-d947-4265-9dd4-d4cda25a0928"
read_b = "50815360-6914-41f9-8da8-1882c8db69e6"
name_to_barcode[read_a] == name_to_barcode[read_b]

In [None]:
# Number of two-part read names where at least one constituent read has no
# entry in name_to_barcode.
len(duplex_missing)

In [None]:
# Number of two-part read names whose constituent reads have equal barcodes.
len(duplex_matches)

In [None]:
# Number of two-part read names whose constituent reads have different barcodes.
len(duplex_mismatches)

In [None]:
# Look at one arbitrary mismatching name for manual inspection.
# NOTE(review): raises IndexError when there are fewer than 11 mismatches.
duplex_mismatches[10]