In [None]:
import itertools as it
import operator
import re
from collections import Counter
from pathlib import Path

import gfapy
import holoviews as hv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.compute as pc
import pyfastx
from pyarrow import csv
from tqdm.auto import tqdm, trange

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import paulssonlab.sequencing.gaf as gaf

In [None]:
# Load the "bokeh" plotting extension for HoloViews.
hv.extension("bokeh")

# Config

In [None]:
# Root directory of the sequencing data; the GAF path selected below is built
# relative to it.
# NOTE(review): absolute, user-specific path — adjust when running elsewhere.
data_dir = Path("/home/jqs1/scratch/jqs1/sequencing/230930_alignment_test/")
# gaf_filename = data_dir / "barcode.gfa"
# gfa = gfapy.Gfa.from_file(data_dir / "references/bcd_rbses.gfa")

# Completeness

In [None]:
# Select the GAF alignment file to analyse: keep exactly one assignment
# uncommented. The commented lines are alternative runs under data_dir.
# gaf_filename = data_dir / "230707_repressilators/channel-135_merged.gaf"
# gaf_filename = data_dir / "230726_carlos/channel-100_merged.gaf"
# gaf_filename = data_dir / "230818_bcd_rbses/channel-100_merged.gaf"
gaf_filename = data_dir / "230818_repressilators/channel-1032_merged.gaf"
# gaf_filename = data_dir / "230922_bcd_rbses_constitutive/channel-100_merged.gaf"

In [None]:
%%time
# Tally segment usage over every row yielded by gaf.iter_gaf:
#   segments    -> number of times each segment name occurs in a path
#   ends        -> number of times each segment is the first or last path step
#   total_reads -> number of rows seen (denominator for the percentages below)
segments = Counter()
ends = Counter()
total_reads = 0
for table in tqdm(gaf.iter_gaf(gaf_filename)):
    for steps in table.column("path").to_pylist():
        # Drop the first character of every step (presumably the orientation
        # marker of a GAF path step — confirm against gaf.iter_gaf's output).
        path = [step[1:] for step in steps]
        segments.update(path)
        # A single-step path contributes twice to the same segment here.
        for endpoint in (path[0], path[-1]):
            ends[endpoint] += 1
        total_reads += 1

In [None]:
# Print, in segment-name order, each segment's occurrence count relative to
# the number of reads, rounded to a whole percent.
for segment_name in sorted(segments):
    percent = f"{segments[segment_name]/total_reads*100:.0f}"
    print(f"{segment_name}: {percent}%")

In [None]:
# Print, in segment-name order, how often each segment is a path endpoint
# relative to the number of reads, rounded to a whole percent.
for segment_name in sorted(ends):
    percent = f"{ends[segment_name]/total_reads*100:.0f}"
    print(f"{segment_name}: {percent}%")

# Duplex barcode mismatches

In [None]:
%%time
# Build two lookups keyed by read name:
#   name_to_path    -> set of segment names on the read's alignment path
#   name_to_barcode -> tuple of NUM_BARCODE_BITS booleans (True where the
#                      bit's "=1" segment is on the path); only stored for
#                      reads whose barcode is complete
# A barcode is treated as complete when a segment for the first bit (BIT0)
# and a segment for the last bit are both on the path.
NUM_BARCODE_BITS = 30
total_reads = 0
complete_barcodes = 0
name_to_barcode = {}
name_to_path = {}
for table in tqdm(gaf.iter_gaf(gaf_filename)):
    names = table.column("name").to_pylist()
    paths = table.column("path").to_pylist()
    for name, steps in zip(names, paths):
        # Drop the first character of every step (presumably the orientation
        # marker of a GAF path step — confirm against gaf.iter_gaf's output).
        path = {step[1:] for step in steps}
        name_to_path[name] = path
        total_reads += 1
        last_bit = NUM_BARCODE_BITS - 1
        has_first_bit = "BC:BIT0=0" in path or "BC:BIT0=1" in path
        has_last_bit = (
            f"BC:BIT{last_bit}=0" in path or f"BC:BIT{last_bit}=1" in path
        )
        if has_first_bit and has_last_bit:
            complete_barcodes += 1
            # Segment names follow "BC:BIT<n>=<value>", as in the completeness
            # test above. The earlier "BIT<n>:1" lookup could never be a member
            # of the set, so every barcode came out as all-False.
            name_to_barcode[name] = tuple(
                f"BC:BIT{bit}=1" in path for bit in range(NUM_BARCODE_BITS)
            )

In [None]:
%%time
# Open the FASTQ that sits next to the GAF file. Only the file name is
# rewritten (".gaf" -> ".fastq"); a plain str.replace on the whole path would
# also rewrite any directory component that happens to contain ".gaf".
reads = pyfastx.Fastq(
    str(gaf_filename.with_name(gaf_filename.name.replace(".gaf", ".fastq")))
)

In [None]:
%%time
# Read names containing ";" are treated as duplex reads; later cells split
# such a name on ";" to recover the two parent read names.
duplex_ids = {read_name for read_name in reads.keys() if ";" in read_name}

In [None]:
%%time
# Read names without ";" are treated as simplex reads.
simplex_ids = {read_name for read_name in reads.keys() if ";" not in read_name}

In [None]:
# Summary tuple, in order: reads with a complete barcode, total aligned rows,
# their ratio, number of duplex read names in the FASTQ, and that number
# relative to the aligned-row count.
(
    complete_barcodes,
    total_reads,
    complete_barcodes / total_reads,
    len(duplex_ids),
    len(duplex_ids) / total_reads,
)

In [None]:
# Sort every duplex read into one bucket:
#   duplex_nobarcode   -> the duplex read itself has no complete barcode
#   duplex_missingboth -> neither parent read has a complete barcode
#   duplex_missingone  -> exactly one parent read has a complete barcode
#   duplex_matches     -> both parents have one and the barcodes are equal
#   duplex_mismatches  -> both parents have one and the barcodes differ
duplex_mismatches = []
duplex_matches = []
duplex_missingone = []
duplex_missingboth = []
duplex_nobarcode = []
for duplex_id in tqdm(duplex_ids):
    if duplex_id not in name_to_barcode:
        duplex_nobarcode.append(duplex_id)
        continue
    # Use a dedicated name for the parent read names: binding them to `reads`
    # would overwrite the pyfastx handle of the same name and break a re-run
    # of the cells that call reads.keys().
    parent_ids = duplex_id.split(";")
    # Number of parents that have a complete barcode (not merely an alignment).
    num_alignments = sum(parent_id in name_to_barcode for parent_id in parent_ids)
    if num_alignments == 0:
        duplex_missingboth.append(duplex_id)
    elif num_alignments == 1:
        duplex_missingone.append(duplex_id)
    elif name_to_barcode[parent_ids[0]] == name_to_barcode[parent_ids[1]]:
        duplex_matches.append(duplex_id)
    else:
        duplex_mismatches.append(duplex_id)

In [None]:
# Sizes of the duplex buckets, in order: no barcode, mismatches, matches,
# one parent missing, both parents missing.
tuple(
    map(
        len,
        (
            duplex_nobarcode,
            duplex_mismatches,
            duplex_matches,
            duplex_missingone,
            duplex_missingboth,
        ),
    )
)

In [None]:
def partial_barcode_mismatches(a, b):
    """Return the keys on which two segment sets carry different values.

    Each segment present in exactly one of ``a`` / ``b`` is reduced to the
    text before its first ``"="``. A key seen more than once among those
    one-sided segments is reported; a key seen only once (a segment missing
    from one side, or a name without a counterpart) is not.

    Parameters
    ----------
    a, b : set of str
        Segment names, e.g. ``"BC:BIT0=1"``.

    Returns
    -------
    set of str
        Keys (text before ``"="``) that occur more than once in ``a ^ b``.
    """
    key_counts = Counter()
    for segment in a ^ b:
        key_counts[segment.partition("=")[0]] += 1
    return {key for key, count in key_counts.items() if count > 1}

In [None]:
# Show the segment set of one duplex read that lacks a complete barcode.
# NOTE(review): index 8 is an arbitrary pick and needs at least nine entries.
name_to_path[duplex_nobarcode[8]]

In [None]:
# For one barcode-less duplex read, print the segment sets of its two parent
# reads (the name split on ";"), then the keys on which those sets disagree.
duplex_read = duplex_nobarcode[8]
parents = duplex_read.split(";")
print("1>", name_to_path[parents[0]])
print("2>", name_to_path[parents[1]])
m = partial_barcode_mismatches(name_to_path[parents[0]], name_to_path[parents[1]])
print()
print(m)

In [None]:
# Same inspection as a loop over the first entry of duplex_nobarcode only.
# NOTE(review): `parents` keeps its value after the loop and is read by the
# following cell.
for duplex_read in duplex_nobarcode[:1]:
    parents = duplex_read.split(";")
    print("1>", name_to_path[parents[0]])
    print("2>", name_to_path[parents[1]])
    m = partial_barcode_mismatches(name_to_path[parents[0]], name_to_path[parents[1]])
    print(m)

In [None]:
# Segment sets of the two parent reads of whichever duplex read `parents`
# currently refers to (it is whatever the last executed cell left behind).
a = name_to_path[parents[0]]
b = name_to_path[parents[1]]

In [None]:
# Display the second parent's segment set.
b

In [None]:
# Segments present on both parents' paths.
a & b

In [None]:
# Segments present on exactly one of the two parents' paths.
a ^ b

In [None]:
# For every barcode-less duplex read, count the keys on which its two parent
# reads disagree. Duplex reads with a parent that has no entry in
# name_to_path are collected separately in no_parent_alignment.
duplex_nobarcode_distances = []
no_parent_alignment = []
for duplex_read in duplex_nobarcode:
    parents = duplex_read.split(";")
    if parents[0] in name_to_path and parents[1] in name_to_path:
        m = partial_barcode_mismatches(
            name_to_path[parents[0]], name_to_path[parents[1]]
        )
        duplex_nobarcode_distances.append(len(m))
    else:
        no_parent_alignment.append(duplex_read)

In [None]:
# Number of duplex reads lacking a parent alignment, then the number for
# which a mismatch count was computed.
tuple(map(len, (no_parent_alignment, duplex_nobarcode_distances)))

In [None]:
# Histogram of mismatch counts as (count of mismatching keys, number of duplex
# reads) pairs, ordered by mismatch count.
sorted(
    Counter(duplex_nobarcode_distances).items(),
    key=operator.itemgetter(0),
)

In [None]:
# NOTE(review): installs tabulate (unpinned, auto-confirmed with -y) from
# inside the notebook; pandas' DataFrame.to_markdown in the next cell needs
# it. Consider declaring it in the environment specification instead.
!micromamba install -y tabulate

In [None]:
# Smoke test: render a one-row frame as a Markdown table to confirm that
# DataFrame.to_markdown works in this environment.
print(pd.DataFrame([{"foo": 100, "bar": 200}]).to_markdown())