In [None]:
import itertools as it
import operator
import re
from pathlib import Path

import gfapy
import holoviews as hv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.compute as pc
from pyarrow import csv
from tqdm.auto import tqdm, trange

In [None]:
# Activate the Bokeh plotting backend for HoloViews (imported as `hv`).
hv.extension("bokeh")

# Config

In [None]:
# Root data directory; the input paths below are resolved relative to it.
# NOTE(review): this is an absolute, user-specific path — adjust it when running elsewhere.
data_dir = Path(
    "/home/jqs1/scratch/jqs1/sequencing/230818_bcd_rbses/20230818_1343_1A_PAQ97606_f49ab41c"
)
# GAF file that the analysis cells stream through parse_gaf().
gaf_filename = data_dir / "temp/mapped_t4.gaf"
# Reference graph in GFA format, loaded with gfapy.
gfa = gfapy.Gfa.from_file(data_dir / "references/bcd_rbses.gfa")

# GAF

In [None]:
# SAM optional-field type codes mapped to the Arrow type each value is cast to.
# SEE: http://samtools.github.io/hts-specs/SAMv1.pdf
# and https://samtools.github.io/hts-specs/SAMtags.pdf
# pyarrow CSV parser only supports pa.dictionary with int32 indices
SAM_TAG_TYPES = dict(
    A=pa.dictionary(pa.int32(), pa.string()),
    f=pa.float32(),
    i=pa.int32(),
    Z=pa.string(),
)
# Arrow types of the fixed GAF columns. Insertion order matters: it is used as
# the column order when the file layout is assembled in parse_gaf_types().
GAF_COLUMN_TYPES = dict(
    query_length=pa.uint64(),
    query_start=pa.uint64(),
    query_end=pa.uint64(),
    strand=pa.dictionary(pa.int32(), pa.string()),
    path=pa.string(),
    path_length=pa.uint64(),
    path_start=pa.uint64(),
    path_end=pa.uint64(),
    residue_matches=pa.uint64(),
    block_length=pa.uint64(),
    mapping_quality=pa.uint8(),
)
# Matches one SAM-style optional field, ``TAG:TYPE:VALUE``, for the value types
# A (single character), f (float), i (integer) and Z (string).
# The numeric alternatives accept an optional sign, and floats may omit the
# integer part or carry an exponent: a tag such as ``NM:i:-3`` or ``dv:f:1e-05``
# must be recognised, otherwise it would be mistaken for a non-tag column.
# Named groups: ``tag`` is the tag name, ``tag_value`` is ``TYPE:VALUE``.
SAM_TAG_REGEX = re.compile(
    r"^(?P<tag>[a-zA-Z0-9]+):"
    r"(?P<tag_value>"
    r"A:."
    r"|f:[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?"
    r"|i:[-+]?\d+"
    r"|Z:.*"
    r")$"
)


def _sam_tag_and_type(field):
    """Return ``(tag, type_code)`` if ``field`` matches SAM_TAG_REGEX, else ``None``."""
    match = SAM_TAG_REGEX.match(field)
    if match is None:
        return None
    # tag_value is "<type code>:<value>"; only the type code is needed here.
    type_code = match.group("tag_value").partition(":")[0]
    return match.group("tag"), type_code


def parse_gaf_types(gaf_filename):
    """Infer the column layout of a GAF file from its first row.

    Only the first line of the file is read. It is expected to consist of, in
    order: the read name, zero or more SAM-style tags, the fixed GAF columns
    listed in ``GAF_COLUMN_TYPES``, and zero or more trailing SAM-style tags.

    Parameters
    ----------
    gaf_filename : path-like
        Path of the tab-separated GAF file.

    Returns
    -------
    column_types : dict
        Column name -> pyarrow type, in file order. Tag columns are typed as
        strings because their raw cells still carry the ``TAG:TYPE:`` prefix.
    columns_to_parse : dict
        Tag name -> SAM type code (e.g. ``"i"``) for every tag column.

    Raises
    ------
    ValueError
        If the first line is empty, has too few columns, contains a non-tag
        field between the read name and the fixed GAF columns, or would yield
        duplicate column names.
    """
    with open(gaf_filename, "r") as f:
        first_line = f.readline()
    if not first_line.strip():
        raise ValueError(
            f"cannot infer GAF columns: first line of {gaf_filename} is empty"
        )
    first_row = first_line.split("\t")
    parsed_tags = [_sam_tag_and_type(field) for field in first_row]

    # Walk back from the end of the row over the trailing run of tags; the
    # fixed GAF columns end where that run begins.
    core_end = len(first_row)
    while core_end > 0 and parsed_tags[core_end - 1] is not None:
        core_end -= 1
    core_start = core_end - len(GAF_COLUMN_TYPES)
    if core_start < 1:
        raise ValueError(
            f"expecting a read name followed by {len(GAF_COLUMN_TYPES)} GAF columns, "
            f"but only {core_end} non-tag leading fields were found in {gaf_filename}"
        )

    # Column 0 is the read name; everything between it and the fixed GAF
    # columns must be a tag.
    leading_tags = parsed_tags[1:core_start]
    if any(parsed is None for parsed in leading_tags):
        raise ValueError("expecting SAM tags following FASTQ read name")

    column_types = {"name": pa.string()}
    columns_to_parse = {}

    def add_tags(tags):
        for tag, type_code in tags:
            # A repeated name would silently collapse in the dict and leave
            # fewer column names than the file has columns.
            if tag in column_types:
                raise ValueError(
                    f"duplicate column name {tag!r} in first row of {gaf_filename}"
                )
            column_types[tag] = pa.string()
            columns_to_parse[tag] = type_code

    add_tags(leading_tags)
    column_types.update(GAF_COLUMN_TYPES)
    add_tags(parsed_tags[core_end:])
    return column_types, columns_to_parse


def parse_gaf_table(table, columns_to_parse):
    """Convert the raw string columns of one GAF table into typed columns.

    Parameters
    ----------
    table : pyarrow.Table
        Table whose tag columns hold raw ``TAG:TYPE:VALUE`` strings and whose
        ``path`` column holds the raw path string.
    columns_to_parse : dict
        Tag column name -> SAM type code (a key of ``SAM_TAG_TYPES``).

    Returns
    -------
    pyarrow.Table
        A new table in which each tag column has its ``TAG:TYPE:`` prefix
        removed and is cast to ``SAM_TAG_TYPES[type]``, and ``path`` is a list
        of oriented steps (each step keeps its leading ``<`` or ``>``).
    """
    # TODO: we could convert string read UUIDs (and semicolon-delimited pairs of UUIDs)
    # to an extension type to save a small amount of space
    # SEE: https://arrow.apache.org/docs/python/extending_types.html#defining-extension-types-user-defined-types
    for tag, type_ in columns_to_parse.items():
        col_idx = table.column_names.index(tag)
        # Anchor at the start of the cell so that only the leading prefix is
        # stripped; an unanchored pattern would also delete any later
        # "TAG:TYPE:" occurrence inside a string value.
        stripped = pc.replace_substring_regex(table[tag], f"^{tag}:{type_}:", "")
        table = table.set_column(col_idx, tag, stripped.cast(SAM_TAG_TYPES[type_]))
    # Split before every "<" or ">" so each step keeps its orientation prefix.
    # The first piece (whatever precedes the first marker) is dropped, so a
    # path without any orientation marker becomes an empty list.
    path = pa.array(
        [re.split(r"(?=<|>)", s)[1:] for s in table.column("path").to_pylist()],
        type=pa.list_(pa.dictionary(pa.int16(), pa.string())),
    )
    table = table.set_column(table.column_names.index("path"), "path", path)
    return table


def parse_gaf(gaf_filename):
    """Stream a GAF file as a sequence of parsed pyarrow Tables.

    The column layout is inferred once from the first row by
    ``parse_gaf_types``; the file is then read batch by batch with the pyarrow
    streaming CSV reader, and each batch is converted by ``parse_gaf_table``.

    Parameters
    ----------
    gaf_filename : path-like
        Path of the tab-separated GAF file.

    Yields
    ------
    pyarrow.Table
        One parsed table per record batch read from the file.
    """
    column_types, columns_to_parse = parse_gaf_types(gaf_filename)
    # Explicit column names also tell the reader that the first row is data.
    read_options = csv.ReadOptions(column_names=list(column_types))
    # GAF has no quoting convention, so a '"' inside a field is literal data.
    parse_options = csv.ParseOptions(delimiter="\t", quote_char=False)
    convert_options = csv.ConvertOptions(column_types=column_types)
    with csv.open_csv(
        gaf_filename,
        read_options=read_options,
        parse_options=parse_options,
        convert_options=convert_options,
    ) as reader:
        # Iterate the reader directly so that only its own exhaustion ends the
        # stream; errors raised while parsing a batch propagate to the caller.
        for batch in reader:
            yield parse_gaf_table(pa.Table.from_batches([batch]), columns_to_parse)

In [None]:
from collections import Counter

# Completeness

In [None]:
%%time
# Tally, over every row of the GAF file:
#   segments    - how often each segment occurs in a path
#   ends        - how often each segment is the first or the last step of a path
#   total_reads - number of rows seen
segments = Counter()
ends = Counter()
total_reads = 0
for table in tqdm(parse_gaf(gaf_filename)):
    path_col = table.column("path")
    # One bulk conversion per batch instead of boxing a pyarrow scalar per row.
    for oriented_path in path_col.to_pylist():
        total_reads += 1
        # Drop the leading orientation character ("<" or ">") from each step.
        path = [s[1:] for s in oriented_path]
        if not path:
            # parse_gaf_table() gives an empty list for a path without any
            # "<"/">" marker; such a row has no segments or ends to count.
            continue
        segments.update(path)
        ends[path[0]] += 1
        ends[path[-1]] += 1

In [None]:
# Occurrences of each segment as a percentage of total_reads, in sorted segment order.
for segment in sorted(segments):
    print(f"{segment}: {segments[segment] / total_reads * 100:.0f}%")

In [None]:
# How often each segment is a path end, as a percentage of total_reads, in sorted segment order.
for segment in sorted(ends):
    print(f"{segment}: {ends[segment] / total_reads * 100:.0f}%")

# Duplex barcode mismatches

In [None]:
%%time
# Build name_to_barcode: for every row whose path contains both a BIT0 and a
# BIT29 segment (in either state), record a 30-element tuple of booleans, one
# per bit, that is True when the "BIT<n>:1" segment is present in the path.
total_reads = 0
complete_barcodes = 0
name_to_barcode = {}
for table in tqdm(parse_gaf(gaf_filename)):
    name_col = table.column("name")
    path_col = table.column("path")
    for idx in range(len(table)):
        name = name_col[idx].as_py()
        # Segment names with the leading orientation character removed.
        path = {s[1:] for s in path_col[idx].as_py()}
        total_reads += 1
        has_first_bit = not path.isdisjoint(("BIT0:0", "BIT0:1"))
        has_last_bit = not path.isdisjoint(("BIT29:0", "BIT29:1"))
        if not (has_first_bit and has_last_bit):
            continue
        complete_barcodes += 1
        barcode = tuple(f"BIT{bit}:1" in path for bit in range(30))
        name_to_barcode[name] = barcode

In [None]:
# (rows with both BIT0 and BIT29 present, total rows, fraction of the former)
(complete_barcodes, total_reads, complete_barcodes / total_reads)

In [None]:
# Classify every name of the form "a;b" (exactly two ";"-separated read names):
#   duplex_missing    - "a" or "b" has no entry in name_to_barcode
#   duplex_matches    - the barcodes recorded for "a" and "b" are equal
#   duplex_mismatches - the barcodes recorded for "a" and "b" differ
duplex_matches, duplex_mismatches, duplex_missing = [], [], []
for name, barcode in tqdm(name_to_barcode.items()):
    reads = name.split(";")
    if len(reads) != 2:
        # Not a pair of read names: nothing to compare.
        continue
    if not all(read in name_to_barcode for read in reads):
        duplex_missing.append(name)
    elif name_to_barcode[reads[0]] == name_to_barcode[reads[1]]:
        duplex_matches.append(name)
    else:
        duplex_mismatches.append(name)

In [None]:
# Spot check: do these two specific reads have the same recorded barcode?
# Raises KeyError if either name has no entry in name_to_barcode.
name_to_barcode["e7a0f1dc-d947-4265-9dd4-d4cda25a0928"] == name_to_barcode[
    "50815360-6914-41f9-8da8-1882c8db69e6"
]

In [None]:
# Number of "a;b" names for which "a" or "b" has no entry in name_to_barcode.
len(duplex_missing)

In [None]:
# Number of "a;b" names whose two constituent reads have equal barcodes.
len(duplex_matches)

In [None]:
# Number of "a;b" names whose two constituent reads have different barcodes.
len(duplex_mismatches)

In [None]:
# Show one mismatching "a;b" name; raises IndexError if there are fewer than 11 mismatches.
# NOTE(review): index 10 looks like an arbitrary example — confirm.
duplex_mismatches[10]