In [None]:
import itertools as it

import gfapy
import holoviews as hv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow as pa
from pyarrow import csv
from tqdm.auto import tqdm

In [None]:
# Select Bokeh as the HoloViews plotting backend for this notebook.
hv.extension("bokeh")

In [None]:
import re
from collections import Counter

In [None]:
def segment_frequences(table, segment_names):
    """Count how often each graph segment occurs in every alignment path.

    Every entry of the ``path`` column is split on the ``>`` / ``<``
    orientation markers and the resulting pieces are tallied.

    Args:
        table: object exposing ``column("path")`` and ``column("name")``.
        segment_names: segment names to report; they become the columns of
            the result, in the given order.

    Returns:
        ``pd.DataFrame`` with one row per path, indexed by the ``name``
        column, whose cells hold the number of occurrences of each segment
        (0 when the segment is absent from the path).
    """
    # One Counter per read; pieces that are not in `segment_names` (such as
    # the empty string before a leading marker) are simply never looked up.
    tallies = (
        Counter(re.split(r">|<", str(path))) for path in table.column("path")
    )
    counts_matrix = [[tally[name] for name in segment_names] for tally in tallies]
    return pd.DataFrame(
        counts_matrix, columns=segment_names, index=table.column("name")
    )

In [None]:
# read1: id
# read2: id if duplex, null otherwise

In [None]:
# gaf row -> row (columns: segments, values: cigar strings with local coördinates [duplicating cigar if necessary, null if segment does not appear])
# filter by duplex vs simplex
# filter out full length from partial-length and unexpected topology (e.g., repeated segments)
# for each segment:
# - number of times it appears in path
# - localized CIGAR (suitable for aggregation, optionally weighted by read quality??)
# get quality score from read???

# NM: edit distance to reference
# AS: alignment score
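# dv: per-base sequence divergence of the alignment (presumably 1 - id; TODO confirm against the aligner's GAF documentation)
# id: sequence identity of the alignment (TODO confirm against the aligner's GAF documentation)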

In [None]:
# Load the GFA graph; its `segment_names` define the columns counted when the
# GAF alignments are parsed.
gfa = gfapy.Gfa.from_file("nao745bc.gfa")

In [None]:
!du -hs *.gaf

In [None]:
# GAF alignment file analysed by the rest of the notebook; the commented-out
# assignments are alternative inputs.
# gaf_filename = "duplex_hac1_subsample_dbg.gaf"
# gaf_filename = "duplex_hac1_subsample_dbg2.gaf"
# gaf_filename = "duplex_hac1_subsample_vg2.gaf"
gaf_filename = "duplex_sup1_vg2.gaf"

In [None]:
%%time
# PATH_REGEX = r"([><][^\s><]+(:\d+-\d+)?)+|([^\s><]+)"
# SEE: http://samtools.github.io/hts-specs/SAMv1.pdf
# and https://samtools.github.io/hts-specs/SAMtags.pdf
SAM_TAG_TYPES = {
    "A": pa.dictionary(pa.int32(), pa.string()),
    "f": pa.float32(),
    "i": pa.int32(),
    "Z": pa.string(),
}
with open(gaf_filename, "r") as f:
    first = f.readline().split("\t")
tags = first[12:]
tag_column_types = {(t := tag.split(":"))[0]: SAM_TAG_TYPES[t[1]] for tag in tags}
column_types = {
    "name": pa.string(),
    "query_length": pa.uint64(),
    "query_start": pa.uint64(),
    "query_end": pa.uint64(),
    "strand": pa.dictionary(pa.int32(), pa.string()),
    "path": pa.string(),
    "path_length": pa.uint64(),
    "path_start": pa.uint64(),
    "path_end": pa.uint64(),
    "residue_matches": pa.uint64(),
    "block_length": pa.uint64(),
    "mapping_quality": pa.uint8(),
    **{tag: pa.string() for tag in tag_column_types.keys()},
}
read_options = csv.ReadOptions(column_names=column_types.keys())
parse_options = csv.ParseOptions(delimiter="\t")
convert_options = csv.ConvertOptions(column_types=column_types)
with csv.open_csv(
    gaf_filename,
    read_options=read_options,
    parse_options=parse_options,
    convert_options=convert_options,
) as f:
    # tt = f.read_next_batch()
    segment_counts = Counter()
    barcode_counts = Counter()
    total = 0
    duplex = 0
    singleton = 0
    filtered = 0
    # while True:
    # for _ in tqdm(it.islice(it.count(), 10)):
    for _ in tqdm(it.count()):
        try:
            table = f.read_next_batch()
        except StopIteration:
            break
        # break
        freqs = segment_frequences(table, gfa.segment_names)
        duplex += freqs.index.str.contains(";").sum()
        # filtered_freqs = freqs[(freqs.max(axis=1) == 1) & ((freqs["BIT0OFF"] == 1) | (freqs["BIT0ON"] == 1)) & (freqs["pPhlF"] == 1)]
        filtered_freqs = freqs[
            # freqs.index.str.contains(";")
            (freqs.max(axis=1) == 1)
            & ((freqs["BIT0OFF"] == 1) | (freqs["BIT0ON"] == 1))
            & (freqs["RBS1"] == 1)
        ]
        # filtered_freqs = freqs
        filtered += len(filtered_freqs)
        total += len(freqs)
        segment_counts.update(list(filtered_freqs.itertuples(index=False)))
        barcode_counts.update(
            list(
                filtered_freqs.loc[
                    :, filtered_freqs.columns.str.startswith("BIT")
                ].itertuples(index=False)
            )
        )

In [None]:
# (reads kept, reads seen, kept fraction, duplex reads, duplex fraction)
(filtered, total, filtered / total, duplex, duplex / total)

In [None]:
def bit_sums(freqs, n_bits=30):
    """Add up the ON and OFF counts of every barcode bit.

    Args:
        freqs: frame (or mapping of Series) providing the columns
            ``BIT{i}ON`` and ``BIT{i}OFF`` for every ``i`` in
            ``range(n_bits)``.
        n_bits: number of barcode bits to combine. Defaults to 30, the value
            that used to be hard-coded.

    Returns:
        ``pd.DataFrame`` with one column ``BIT{i}`` per bit holding
        ``BIT{i}ON + BIT{i}OFF``.

    Raises:
        KeyError: if one of the expected ON/OFF columns is missing.
    """
    return pd.DataFrame(
        {f"BIT{i}": freqs[f"BIT{i}ON"] + freqs[f"BIT{i}OFF"] for i in range(n_bits)}
    )

In [None]:
# NOTE(review): `filtered_freqs` is reassigned for every batch in the parsing
# loop, so here it holds only the filtered reads of the last batch read.
# Mean count of every segment across those reads.
filtered_freqs.mean(axis=0)

In [None]:
# Bar chart of the mean count of each column whose name ends in "ON".
filtered_freqs.loc[:, filtered_freqs.columns.str.endswith("ON")].mean(axis=0).plot.bar()

In [None]:
# Bar chart of the mean count of every segment.
filtered_freqs.mean(axis=0).plot.bar()

In [None]:
# Three most frequent full segment-count rows among the filtered reads.
segment_counts.most_common(3)

In [None]:
# Three most frequent barcodes (BIT* columns only).
barcode_counts.most_common(3)

In [None]:
# Histogram of reads per barcode, with a log-scaled count axis.
plt.hist(barcode_counts.values(), bins=100, log=True);

In [None]:
# Reverse-cumulative step histogram of reads per barcode: with cumulative=-1
# the accumulation runs from the highest bin downwards.
n, bins, patches = plt.hist(
    barcode_counts.values(),
    100,
    histtype="step",
    density=False,
    cumulative=-1,
    log=True,
)

In [None]:
# Same plot with the bins restricted to the range 0-10 reads per barcode.
n, bins, patches = plt.hist(
    barcode_counts.values(),
    100,
    range=(0, 10),
    histtype="step",
    density=False,
    cumulative=-1,
    log=True,
)

In [None]:
# Number of distinct barcodes observed among the filtered reads.
len(barcode_counts)

In [None]:
# Number of barcodes seen exactly once / twice / three / four times.
sum(1 for v in barcode_counts.values() if v == 1)

In [None]:
sum(1 for v in barcode_counts.values() if v == 2)

In [None]:
sum(1 for v in barcode_counts.values() if v == 3)

In [None]:
sum(1 for v in barcode_counts.values() if v == 4)

In [None]:
# Number of barcodes per read-count bracket.
sum(1 for v in barcode_counts.values() if v >= 5)

In [None]:
sum(1 for v in barcode_counts.values() if 5 <= v < 20)

In [None]:
sum(1 for v in barcode_counts.values() if 20 <= v < 100)

In [None]:
sum(1 for v in barcode_counts.values() if v >= 100)

In [None]:
# NOTE(review): identical to the previous cell.
sum(1 for v in barcode_counts.values() if v >= 100)

In [None]:
# Number of barcodes seen at least twice.
sum(1 for v in barcode_counts.values() if v >= 2)

In [None]:
# Largest number of reads sharing one barcode.
max(barcode_counts.values())

In [None]:
# Reads whose barcode is unique, as a fraction of all reads seen.
# NOTE(review): divides by `total`, whereas the fractions below divide by
# `filtered` — confirm which denominator is intended.
sum(v for v in barcode_counts.values() if v == 1) / total

In [None]:
# NOTE(review): repeats the ">= 2" cell above.
sum(1 for v in barcode_counts.values() if v >= 2)

In [None]:
# Number of reads belonging to barcodes seen at least 10 times.
sum(v for v in barcode_counts.values() if v >= 10)

In [None]:
# Fraction of filtered reads belonging to barcodes seen >= 10 / 50 / 100 times.
sum(v for v in barcode_counts.values() if v >= 10) / filtered

In [None]:
sum(v for v in barcode_counts.values() if v >= 50) / filtered

In [None]:
sum(v for v in barcode_counts.values() if v >= 100) / filtered

In [None]:
# Sanity check: every filtered read adds one to `barcode_counts`, so the two
# numbers below should be equal.
filtered

In [None]:
sum(v for v in barcode_counts.values())

In [None]:
# Reverse-cumulative step histogram of reads per barcode over the range 0-20,
# normalised (density=True) rather than shown as raw counts.
n, bins, patches = plt.hist(
    barcode_counts.values(),
    100,
    range=(0, 20),
    histtype="step",
    density=True,
    cumulative=-1,
    log=True,
)

In [None]:
f = extract_barcodes(tt, gfa.segment_names)

In [None]:
tuple(next(f.iterrows())[1].values)

In [None]:
f.max(axis=1)

In [None]:
tt.column("path")

In [None]:
re.split(r">|<", s)

In [None]:
paths = tt.select(["path"]).to_pandas()

In [None]:
s = paths.iloc[0, 0]

In [None]:
PATH_REGEX

In [None]:
m = re.match(PATH_REGEX, paths.iloc[0, 0])

In [None]:
m.groups()

In [None]:
tt.take([0]).to_pandas()

In [None]:
# Look up the "BIT1OFF" segment record in the graph.
gfa.try_get_segment("BIT1OFF")

In [None]:
# pandas copy of the record batch `tt`.
tt2 = tt.to_pandas()

In [None]:
# Distribution of path lengths in that batch.
tt2["path_length"].plot.hist(bins=50)

In [None]:
# How often each mapping-quality value occurs in that batch.
tt2["mapping_quality"].value_counts()

In [None]:
# pandas copy of `table`, i.e. the last batch read by the parsing loop.
t = table.to_pandas()

In [None]:
import uuid

In [None]:
# Compare the in-memory size of a UUID held as an int with its string form.
u = uuid.UUID("6e507a8a-c271-4561-8768-0f9bf9d4c301")

In [None]:
import sys

In [None]:
# Size in bytes of the integer representation.
sys.getsizeof(u.int)

In [None]:
# Size in bytes of the string representation.
sys.getsizeof("6e507a8a-c271-4561-8768-0f9bf9d4c301")

In [None]:
u.int

In [None]:
# First value of the "cg" tag column.
t["cg"][0]

In [None]:
# Split read names on ";", the character the parsing loop counts as duplex.
t["name"].str.split(";")

In [None]:
# 1) segment_cigars df (segment coördinates, normalize orientation) [numba]
# 2) segment_mismatches df (cellwise apply, get insertions/deletions/mismatches/equal)
# 3) filter on barcode mismatches (?)
# 4) group segment_cigars by barcode, run cigar_aggregation on non-barcode (or all!) segments
# 5)

In [None]:
import stream
import vg_pb2

In [None]:
gam_filename = "duplex_sup1_subsample_vg2.gam"

In [None]:
from itertools import islice

In [None]:
len(
    "CACTGGAAACATCAAGGTCGACGAGGAGAGCGCGGGTGAGAGGGATTCGTTACCAATAGAGCAACGTACGACGTTCAATATAATGCTAGCTACTAGCGCTGTCTGTACTTGTATCAGTACACTGACCAGTCCCTAAAGAACGAAACAACCGCCTCTACAAATAATTTTGTTTAACCATAACAAAGAAAGGGGGTATTCTAATGAAACCAGTAACGTTATACGATGTCGCAGAGTATGCCGGTGTCTTCTTATATGACCGTTTCCCGCGTGGTGAACCAGGCCAGCCACGTTCTGCGAAAACGCGGAAAAAGTGGAAGCGGCGATGGTGGAGCTGAATTACATTCCCGACCGCGTGGCACAACAACTGGCGGGCAAACAGTCGTTGCTGATTGGCGTTGCCACCTCCAGTCTGGCCCTGCACGCGCCGTCGCAAATTGTCGCGGCGATTAAATTCGCGCCGATCAACTGGGTGCGCCAGCGTGGTGGTCTGAATCGGTAGAACGAAGCGGCGTCGAAGCCTGTAAAGCGGCGGTGCACAATCTTCTCGCGCAACGCGTCAGGTGGGCTGATCATTAACTATCCGCTGGATGACCAGGATGCCATTGCTGTGGAAGCTGCCTGCACTAATGTTCCGGCGTTATTTCTTGATGTTCTGACCAGACACCCATCAACAGTATTATTTACTCCCATGAGGACGGTACGCGACTGGGCGTGGAGCATCTGGTCGCATTGGGTCACCAGCAAATCGCGCTTGTTAGCGGGCCCATTAAGTTCTGTCTCGGCGCGTCTGCGTCTGGCTGGCTGGCATAAATATCTCACTCGCAATCAAATTCAGCCGATAGCGGAACGGGAAGGCGACTGGAGTGCCATGTCGGTTTTCAACAAACCATGCAAATGCTGAATGAGGGCATCGTTCCCACTGCGATGCTGGTTGCCAACGATCAGATGGCGCTGGGCGCAATGCGCGCCATTACCGAGTCCGGGCTGCGCGTTGGTGCGGATATCTCGGTAGTGGTAGGATACGACGATACCGAAGATAGCTCATGTTAATCCCGCCGTTAACCACCATCAACAGGATTTTTCGCCTGCTGGGGCAAACCAGCGTGGACCGCTTGCTGCAACTCTCAGGCCAGGCGGTGAAGGGCAATCAGCTGTTGCCAGTCTCACTGGTGAAAAGAAAAACCACCCTGGCGCCCAATACGCAAACCGCCTCTCCCCGCGCGTTGGCCGATTCATTAATGCAGCTGGCACGACAGGTTTCCCGACTGGAAAGCAGGCAGTAATAAAGGTCCGGCAATTAAAAAAGCGGCTAACCACGCCGCTTTTTTTAGATGTCACTCGGTACCAAATTCCAGAAAGAGGCCTCCCGAAAGGGGGGCCTTTTTTCGTTTTTGGTCCGCTTTGAAGGAGTGTTGACAATTAATCATCGGCTCGATAATGTGTGGAATTGTGAGCGCTCACAATTTACTAGCGCTGTCTGTACTTGTATCAGTACACTGACGAGTCCCTAAAGGACGAAACACCGCCTCTACAAATAATTTTGTTCAACCATAACAAAGAAAGGGGGTCCTGTAATGGCACGTACCCGAGCCGTAGCAGCATTGGTAGCCTGCGTAGTCCGCATACCCATAAAGCAATTCTCGACCAGCACCATTGAAATCCTGAAAAGAATGTGGTTATAGCGGTCTGAGCATTGAAAGCGTGGCACGTCGCGCCGGTGCAGGCAAACCGACCATTCATCGTTGGTGGACCAACAAAGCAGCACTGATTGCCGAAGTGTATGAATGAAATCGAACAGGTACGTAAATTTCCGGATTTGGGTAGCTTTAAAGCCGATCTGGATTTTCTGCTGCATAATCTGTGGAAAGTTTGGCGTGAAACCATTTGTGGTGAAGCATTTCGTTGTGTTATTGCAGAAGCACAGTTGGACCCTGCAACCCTGACCCAACTGAAAGATCAGTTTATGGAACGTCGTCGTGAGATACCGAAAAAACT
GGTTGAAGATGCCATTAGCAATGGTGAACTGCCGAAAGATATCAATCGTGAACTGCTGCTGGATATGATTTTTGGTTTTTGGTTGGTATCGCCTGCTGACCGAACAGTTGACCGTTGAACAGGATCGTAAAGATTGACCTTCCTGCTGATTAATGGTGTTTGTCCGGGTACACAGCGTCAATAAGGTCCGGCTTATCGGTCAGTTTCACCTGATTTACGTAAAAACCCGCTTCGGCGGGTTTTTGCTTTTGAGGGGCAGAAAGATGAATGACTGTCCACGACGCTATACCCAAAAGAAAAAAAACCCCGCCCGTGACAGGGCGGGTTTTTTTTTGCTTAAATAGGAGCGACGTACGGTGGAATCTGATTCGTTACCAATTGACATGATACGGAACGTACCGTATCGTTAAGGTTACTAGCGCTGTCTGGTACTTGTATCAGTACACTGACGAGTCCCTAAAGGACGAAACACCGCCTCTACAAATAATTTTGTTTAACCATAACAGAGAAAGGGGGTTTTCTGAATGCCGAAACTGGGTATGCAGAGCATTCGTCGTCGTCAGCTGATTGATGCAACCCTGGAAGCAATTAATGAAGTTGGTATGCATGATGCAACCATTGCACAGATTGCACGTCGTGCCGGTGTTAGCACCGTATGATTAGCCATTATTTCCGCGATAAACGGTCGACTGGATGGAAGCAACGATGCGTGATATTACCAGCCAGCTGCGTGATGCAGTTCTGAATCGTCTGCATGCACTGCCGCAGGGTAGCGCAGAACAGCGTCTGCAGGCAATTGTTGGTGGTAATTTTGATGAAACCCAGGTTAGCAGCGCAGCAATGAAAGCATGGCTGGCATTTGGGCAATCGAGCATGCATCAGCCGATGCTGTATCGTCTGCAGCAGGTTAGCAGTCGTCGTCTGCTGAGCAATCTGGTTAGCGAATTTCGTCCGTGAACTGCCTCGTGAACAGGCACAAGAGGCAGGTTATGGTCTGGCAGCACTGATTGATGGTCTGTGGCTGCGTGCAGCACTGAAGCGGTAAACCGCTCGGATAAAACCCGTGCAAATAAGCCTGACCCGTCATTTTATCACCCAGCATCTGCCGACCGATTAATAAGGTCTTTAAAAAGAAACCTCCGCATTGCGGAGGTTTCGCCTTTTGATACTCTGTCTGAAGTAATTCTTGCCGCAGTGAAAAATGGTGCCCATCGGCGCCATTTTTTTATGCTTCCATTAGAAAGCAAAAAGCCTGCTAGAAAGCAGGCTTTTAATTTGGCTCCTCGGGCACGCTTACTGAGGAGCGAGGATAAGATAAAATTACCACAACTGCGAGCCCTTCCACCAAAAAAAACAAGATAGCCGCGCGAACGCGGCTAACTGTTGAAAAAAACAGATAACAGATACGAAGAATCTGTTATCGTAAAAACCCCTCAAAGACCGTTTAAGAAGGCCCAAGGGGTTATTACTGATGGCAATGTGATGACCTCATCATTCGCTCATCATCACCAACTGAATCTCACCTTCCACTTCACGATCCATATCCTTCTCACCCTGACCTTTCTCCATACCCAACTTTCCTAACAACCAACTAACCTGCCTCTTCATCAGTCCATGATTCTCCCTCTATCAGCTCCAGCAACCACACTCATAATGCATATCCAACCACAACCTCAGTCAACTCATTACCCACAACCGCCCTTACCAACAACAATCCTGATTCCATACCCACTCCGTATCCTTCAATCCCTCCACAGCATAACCCTACACACAACACGTATCGTTATGTAATTGCTAGACCCTTTACAAACACACCTGAGGTAAACCCATCCCACATCCTGTCAATCCACCATTCCTCAACGAAACTTCATCACTCTCCTCCTCACTAATTCACAACTCTCGTCCCCCACTTCACTTCACTATGCTTCTTCTCTTACACCCTCTGTATCATCCTCCTTCTCTCACCACCATAACTCCATCCTTCGCCCTCTTACTTATCT
GACCCAGACCAACTTCCACACTCACTGGATCTCATCAATCCCA"
)

In [None]:
!head -n 4 duplex_sup1_subsample.fastq

In [None]:
# Print the first two alignment records of the GAM file, each followed by a
# blank line, a row of asterisks and another blank line.
first_two_records = islice(stream.parse(gam_filename, vg_pb2.Alignment), 2)
for msg in first_two_records:
    print(msg, "", "************", "", sep="\n")

In [None]:
# Print the first alignment record whose name has no ";" and stop there;
# `msg` stays bound to that record for the cells that follow.
for msg in stream.parse(gam_filename, vg_pb2.Alignment):
    if ";" not in msg.name:
        print(msg, "", "************", "", sep="\n")
        break

In [None]:
# Name of the record left in `msg` by the preceding loop.
msg.name

In [None]:
# `quality` field of the same record.
msg.quality