In [None]:
import itertools as it
import operator
import re
import subprocess
import tempfile
import time
from collections import Counter, defaultdict
from functools import partial
from pathlib import Path

import duckdb
import gfapy
import holoviews as hv
import hvplot.pandas
import ibis
import matplotlib.pyplot as plt
import numba
import numpy as np
import pandas as pd
import parasail
import polars as pl
import pyabpoa
import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.dataset as ds
import pyfastx
import pysam
import spoa
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from pyarrow import csv
from pywfa import WavefrontAligner
from tqdm.auto import tqdm, trange

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import paulssonlab.sequencing.align as align
import paulssonlab.sequencing.cigar as scigar
import paulssonlab.sequencing.consensus as con
import paulssonlab.sequencing.gfa as sgfa
import paulssonlab.sequencing.io as sio
import paulssonlab.sequencing.processing as processing
from paulssonlab.util.sequence import reverse_complement

In [None]:
# Load the bokeh plotting extension for HoloViews (used by the hvplot calls below).
hv.extension("bokeh")

In [None]:
# Turn on polars' global string cache for the rest of the session.
pl.enable_string_cache()

# 231201_bcd_rbses

In [None]:
# Read a GAF alignment file and keep it both as the object returned by
# sio.read_gaf (`gaf`) and as a polars DataFrame (`gaf_df`).
# The commented-out path below points at the 231130 MinION run instead.
# gaf_filename = "/home/jqs1/scratch/sequencing/231130_bcd_rbses_run3_minion/20231130_1904_MN35044_FAX40126_ee95ee31/_temp/vg1.gaf"
gaf_filename = "/home/jqs1/scratch/sequencing/231201_bcd_rbses_run3/20231201_1101_1F_PAU05823_773c75ee/_temp/vg.gaf"
gaf = sio.read_gaf(gaf_filename)
# NOTE(review): pl.from_arrow implies read_gaf returns an Arrow table -- confirm.
gaf_df = pl.from_arrow(gaf)

In [None]:
# Count how often each segment name occurs across all alignment paths.
seg_counts = Counter()
path_column = gaf["path"]
for idx in trange(len(gaf)):
    # s[1:] drops the first character of every path entry
    # (presumably the "<" / ">" orientation marker -- TODO confirm).
    seg_counts.update(s[1:] for s in path_column[idx].as_py())

In [None]:
# Show the per-segment occurrence counts.
seg_counts

In [None]:
refs = {
    "pLIB430": (
        "GCACTGAAGGTCCTCAATCGCACTGGAAACATCAAGGTCGACGAGGAGTTTACGGCTAGCTCAGTCCTAGGTATAGTGCTAGCTACTAGCGCTGTCTGTACTTGTATCAGTACACTGACGAGTCCCTAAAGGACGAAACACCGCCTCTACAAATAATTTTGTTTAACCATAGAGAAAGAGGAGAAATACTAATGGTCAGTAAAGGCGAAGCAGTTATCAAAGAGTTCATGCGCTTCAAAGTTCATATGGAAGGGTCGATGAACGGGCACGAATTTGAAATTGAAGGCGAAGGCGAAGGCCGCCCATATGAAGGGACCCAAACCGCAAAGCTTAAGGTTACTAAAGGCGGTCCATTACCCTTTTCGTGGGACATTTTAAGCCCACAGTTTATGTACGGGAGTCGCGCTTTCATCAAGCACCCTGCGGACATCCCAGATTACTACAAACAGTCTTTCCCCGAGGGGTTCAAGTGGGAGCGCGTGATGAACTTCGAGGATGGCGGAGCCGTGACGGTCACCCAAGATACCTCTTTGGAGGACGGTACGTTGATCTACAAAGTGAAATTGCGTGGCACGAATTTTCCACCTGATGGGCCTGTCATGCAGAAAAAGACAATGGGATGGGAAGCTTCCACGGAGCGCCTTTACCCAGAGGACGGTGTTCTTAAAGGGGATATCAAAATGGCGCTGCGTCTTAAAGATGGAGGCCGCTACCTGGCGGACTTCAAGACTACTTACAAGGCCAAAAAACCAGTGCAGATGCCGGGTGCGTACAATGTAGATCGTAAATTAGATATTACAAGTCACAATGAAGATTACACGGTCGTAGAGCAGTATGAGCGCAGTGAGGGGCGTCACTCTACGGGCGGTATGGACGAGTTATACAAGTAAAGGTGGAAACACAGAAAAAAGCCCGCACCTGACAGTGCGGGCTTTTTTTTTCGACCAAAGGCTCGGTACCAAATTCCAGAAAAGACACCCGAAAGGGTGTTTTTTCGTTTTGGTCCGCTTATGAAGGAGTTGACGGCTAGCTCAGTCCTAGGTACAGTGCTAGCTACTAGCTGTCACCGGATGTGCTTTCCGGTCTGATGAGTCCGTGAGGACGAAACAGCCTCTACAAATAATTTTGTTTAAGGGCCCAAGTTCACTTAAAAAGGAGATCAACAATGAAAGCAATTTTCGTACTGAAACATCTTAATCATGC",
        "NNNNNNNNN",
        "TTAAAAATGAGTAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGAATTAGATGGTGATGTTAATGGGCACAAATTTTCTGTCAGTGGAGAGGGTGAAGGTGATGCAACATACGGAAAACTTACCCTTAAATTGATTTGCACTACTGGAAAACTACCTGTTCCATGGCCAACACTTGTCACTACTTTGGGTTATGGTGTTCAATGCTTTGCGAGATACCCAGATCATATGAAACAGCATGACTTTTTCAAGAGTGCCATGCCCGAAGGTTATGTACAGGAAAGAACTATATTTTTCAAAGATGACGGGAACTACAAGACACGTGCTGAAGTCAAGTTTGAAGGTGATACCCTTGTTAATAGAATCGAGTTAAAAGGTATTGATTTTAAAGAAGATGGAAACATTCTTGGACACAAATTGGAATACAACTATAACTCACACAATGTATACATCACGGCAGACAAACAAAAGAATGGAATCAAAGCGAACTTCAAAATTAGACACAACATTGAAGATGGAGGTGTTCAACTAGCAGACCATTATCAACAAAATACTCCAATTGGCGATGGCCCTGTCCTTTTACCAGACAACCATTACCTGTCCTACCAATCTAAGCTTTCGAAAGATCCCAACGAAAAGAGAGATCACATGGTCCTTCTTGAGTTTGTAACAGCTGCTGGGATTACACATGGCATGGATGAACTATACAAATAAGGTCCGGCTTATCGGTCAGTTTCACCTGATTTACGTAAAAACCCGCTTCGGCGGGTTTTTGCTTTTGGAGGGGCAGAAAGATGAATGACTGTCCACGACGCTATACCCAAAAGAAAAAAAAAAAACCCCGCCCCTGACAGGGCGGGGTTTTTTTTGCTTA",
    ),
    "pLIB431": (
        "GCACTGAAGGTCCTCAATCGCACTGGAAACATCAAGGTCGACGAGGAGTTTACGGCTAGCTCAGTCCTAGGTATAGTGCTAGCTACTAGCGCTGTCTGTACTTGTATCAGTACACTGACGAGTCCCTAAAGGACGAAACACCGCCTCTACAAATAATTTTGTTTAACCATAGAGAAAGAGGAGAAATACTAATGGTCAGTAAAGGCGAAGCAGTTATCAAAGAGTTCATGCGCTTCAAAGTTCATATGGAAGGGTCGATGAACGGGCACGAATTTGAAATTGAAGGCGAAGGCGAAGGCCGCCCATATGAAGGGACCCAAACCGCAAAGCTTAAGGTTACTAAAGGCGGTCCATTACCCTTTTCGTGGGACATTTTAAGCCCACAGTTTATGTACGGGAGTCGCGCTTTCATCAAGCACCCTGCGGACATCCCAGATTACTACAAACAGTCTTTCCCCGAGGGGTTCAAGTGGGAGCGCGTGATGAACTTCGAGGATGGCGGAGCCGTGACGGTCACCCAAGATACCTCTTTGGAGGACGGTACGTTGATCTACAAAGTGAAATTGCGTGGCACGAATTTTCCACCTGATGGGCCTGTCATGCAGAAAAAGACAATGGGATGGGAAGCTTCCACGGAGCGCCTTTACCCAGAGGACGGTGTTCTTAAAGGGGATATCAAAATGGCGCTGCGTCTTAAAGATGGAGGCCGCTACCTGGCGGACTTCAAGACTACTTACAAGGCCAAAAAACCAGTGCAGATGCCGGGTGCGTACAATGTAGATCGTAAATTAGATATTACAAGTCACAATGAAGATTACACGGTCGTAGAGCAGTATGAGCGCAGTGAGGGGCGTCACTCTACGGGCGGTATGGACGAGTTATACAAGTAAAGGTGGAAACACAGAAAAAAGCCCGCACCTGACAGTGCGGGCTTTTTTTTTCGACCAAAGGCTCGGTACCAAATTCCAGAAAAGACACCCGAAAGGGTGTTTTTTCGTTTTGGTCCGCTTATGAAGGAGTTGACGGCTAGCTCAGTCCTAGGTACAGTGCTAGCTACTAGCGCTGTCTGTACTTGTATCAGTACACTGACGAGTCCCTAAAGGACGAAACACCGCCTCTACAAATAATTTTGTTTAACCAT",
        "AACAWAGAAAGGGGGTHHHCK",
        "AATGAGTAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGAATTAGATGGTGATGTTAATGGGCACAAATTTTCTGTCAGTGGAGAGGGTGAAGGTGATGCAACATACGGAAAACTTACCCTTAAATTGATTTGCACTACTGGAAAACTACCTGTTCCATGGCCAACACTTGTCACTACTTTGGGTTATGGTGTTCAATGCTTTGCGAGATACCCAGATCATATGAAACAGCATGACTTTTTCAAGAGTGCCATGCCCGAAGGTTATGTACAGGAAAGAACTATATTTTTCAAAGATGACGGGAACTACAAGACACGTGCTGAAGTCAAGTTTGAAGGTGATACCCTTGTTAATAGAATCGAGTTAAAAGGTATTGATTTTAAAGAAGATGGAAACATTCTTGGACACAAATTGGAATACAACTATAACTCACACAATGTATACATCACGGCAGACAAACAAAAGAATGGAATCAAAGCGAACTTCAAAATTAGACACAACATTGAAGATGGAGGTGTTCAACTAGCAGACCATTATCAACAAAATACTCCAATTGGCGATGGCCCTGTCCTTTTACCAGACAACCATTACCTGTCCTACCAATCTAAGCTTTCGAAAGATCCCAACGAAAAGAGAGATCACATGGTCCTTCTTGAGTTTGTAACAGCTGCTGGGATTACACATGGCATGGATGAACTATACAAATAAGGTCCGGCTTATCGGTCAGTTTCACCTGATTTACGTAAAAACCCGCTTCGGCGGGTTTTTGCTTTTGGAGGGGCAGAAAGATGAATGACTGTCCACGACGCTATACCCAAAAGAAAAAAAAAAAACCCCGCCCCTGACAGGGCGGGGTTTTTTTTGCTTA",
    ),
    "pLIB432_B0032m": (
        "GCACTGAAGGTCCTCAATCGCACTGGAAACATCAAGGTCGACGAGGAGTTTACGGCTAGCTCAGTCCTAGGTATAGTGCTAGCTACTAGCGCTGTCTGTACTTGTATCAGTACACTGACGAGTCCCTAAAGGACGAAACACCGCCTCTACAAATAATTTTGTTTAACCATAGAGAAAGAGGAGAAATACTAATGGTCAGTAAAGGCGAAGCAGTTATCAAAGAGTTCATGCGCTTCAAAGTTCATATGGAAGGGTCGATGAACGGGCACGAATTTGAAATTGAAGGCGAAGGCGAAGGCCGCCCATATGAAGGGACCCAAACCGCAAAGCTTAAGGTTACTAAAGGCGGTCCATTACCCTTTTCGTGGGACATTTTAAGCCCACAGTTTATGTACGGGAGTCGCGCTTTCATCAAGCACCCTGCGGACATCCCAGATTACTACAAACAGTCTTTCCCCGAGGGGTTCAAGTGGGAGCGCGTGATGAACTTCGAGGATGGCGGAGCCGTGACGGTCACCCAAGATACCTCTTTGGAGGACGGTACGTTGATCTACAAAGTGAAATTGCGTGGCACGAATTTTCCACCTGATGGGCCTGTCATGCAGAAAAAGACAATGGGATGGGAAGCTTCCACGGAGCGCCTTTACCCAGAGGACGGTGTTCTTAAAGGGGATATCAAAATGGCGCTGCGTCTTAAAGATGGAGGCCGCTACCTGGCGGACTTCAAGACTACTTACAAGGCCAAAAAACCAGTGCAGATGCCGGGTGCGTACAATGTAGATCGTAAATTAGATATTACAAGTCACAATGAAGATTACACGGTCGTAGAGCAGTATGAGCGCAGTGAGGGGCGTCACTCTACGGGCGGTATGGACGAGTTATACAAGTAAAGGTGGAAACACAGAAAAAAGCCCGCACCTGACAGTGCGGGCTTTTTTTTTCGACCAAAGGCTCGGTACCAAATTCCAGAAAAGACACCCGAAAGGGTGTTTTTTCGTTTTGGTCCGCTTATGAAGGAGTTGACGGCTAGCTCAGTCCTAGGTACAGTGCTAGCTACTAGCGCTGTCTGTACTTGTATCAGTACACTGACGAGTCCCTAAAGGACGAAACACCGCCTCTACAAATAATTTTGTTTAA",
        "CCATAGAGTCACACAGGAAAGTACT",
        "AATGAGTAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGAATTAGATGGTGATGTTAATGGGCACAAATTTTCTGTCAGTGGAGAGGGTGAAGGTGATGCAACATACGGAAAACTTACCCTTAAATTGATTTGCACTACTGGAAAACTACCTGTTCCATGGCCAACACTTGTCACTACTTTGGGTTATGGTGTTCAATGCTTTGCGAGATACCCAGATCATATGAAACAGCATGACTTTTTCAAGAGTGCCATGCCCGAAGGTTATGTACAGGAAAGAACTATATTTTTCAAAGATGACGGGAACTACAAGACACGTGCTGAAGTCAAGTTTGAAGGTGATACCCTTGTTAATAGAATCGAGTTAAAAGGTATTGATTTTAAAGAAGATGGAAACATTCTTGGACACAAATTGGAATACAACTATAACTCACACAATGTATACATCACGGCAGACAAACAAAAGAATGGAATCAAAGCGAACTTCAAAATTAGACACAACATTGAAGATGGAGGTGTTCAACTAGCAGACCATTATCAACAAAATACTCCAATTGGCGATGGCCCTGTCCTTTTACCAGACAACCATTACCTGTCCTACCAATCTAAGCTTTCGAAAGATCCCAACGAAAAGAGAGATCACATGGTCCTTCTTGAGTTTGTAACAGCTGCTGGGATTACACATGGCATGGATGAACTATACAAATAAGGTCCGGCTTATCGGTCAGTTTCACCTGATTTACGTAAAAACCCGCTTCGGCGGGTTTTTGCTTTTGGAGGGGCAGAAAGATGAATGACTGTCCACGACGCTATACCCAAAAGAAAAAAAAAAAACCCCGCCCCTGACAGGGCGGGGTTTTTTTTGCTTA",
    ),
    "pLIB432_B0033m": (
        "GCACTGAAGGTCCTCAATCGCACTGGAAACATCAAGGTCGACGAGGAGTTTACGGCTAGCTCAGTCCTAGGTATAGTGCTAGCTACTAGCGCTGTCTGTACTTGTATCAGTACACTGACGAGTCCCTAAAGGACGAAACACCGCCTCTACAAATAATTTTGTTTAACCATAGAGAAAGAGGAGAAATACTAATGGTCAGTAAAGGCGAAGCAGTTATCAAAGAGTTCATGCGCTTCAAAGTTCATATGGAAGGGTCGATGAACGGGCACGAATTTGAAATTGAAGGCGAAGGCGAAGGCCGCCCATATGAAGGGACCCAAACCGCAAAGCTTAAGGTTACTAAAGGCGGTCCATTACCCTTTTCGTGGGACATTTTAAGCCCACAGTTTATGTACGGGAGTCGCGCTTTCATCAAGCACCCTGCGGACATCCCAGATTACTACAAACAGTCTTTCCCCGAGGGGTTCAAGTGGGAGCGCGTGATGAACTTCGAGGATGGCGGAGCCGTGACGGTCACCCAAGATACCTCTTTGGAGGACGGTACGTTGATCTACAAAGTGAAATTGCGTGGCACGAATTTTCCACCTGATGGGCCTGTCATGCAGAAAAAGACAATGGGATGGGAAGCTTCCACGGAGCGCCTTTACCCAGAGGACGGTGTTCTTAAAGGGGATATCAAAATGGCGCTGCGTCTTAAAGATGGAGGCCGCTACCTGGCGGACTTCAAGACTACTTACAAGGCCAAAAAACCAGTGCAGATGCCGGGTGCGTACAATGTAGATCGTAAATTAGATATTACAAGTCACAATGAAGATTACACGGTCGTAGAGCAGTATGAGCGCAGTGAGGGGCGTCACTCTACGGGCGGTATGGACGAGTTATACAAGTAAAGGTGGAAACACAGAAAAAAGCCCGCACCTGACAGTGCGGGCTTTTTTTTTCGACCAAAGGCTCGGTACCAAATTCCAGAAAAGACACCCGAAAGGGTGTTTTTTCGTTTTGGTCCGCTTATGAAGGAGTTGACGGCTAGCTCAGTCCTAGGTACAGTGCTAGCTACTAGCGCTGTCTGTACTTGTATCAGTACACTGACGAGTCCCTAAAGGACGAAACACCGCCTCTACAAATAATTTTGTTTAA",
        "CCATAGAGTCACACAGGACTACT",
        "AATGAGTAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGAATTAGATGGTGATGTTAATGGGCACAAATTTTCTGTCAGTGGAGAGGGTGAAGGTGATGCAACATACGGAAAACTTACCCTTAAATTGATTTGCACTACTGGAAAACTACCTGTTCCATGGCCAACACTTGTCACTACTTTGGGTTATGGTGTTCAATGCTTTGCGAGATACCCAGATCATATGAAACAGCATGACTTTTTCAAGAGTGCCATGCCCGAAGGTTATGTACAGGAAAGAACTATATTTTTCAAAGATGACGGGAACTACAAGACACGTGCTGAAGTCAAGTTTGAAGGTGATACCCTTGTTAATAGAATCGAGTTAAAAGGTATTGATTTTAAAGAAGATGGAAACATTCTTGGACACAAATTGGAATACAACTATAACTCACACAATGTATACATCACGGCAGACAAACAAAAGAATGGAATCAAAGCGAACTTCAAAATTAGACACAACATTGAAGATGGAGGTGTTCAACTAGCAGACCATTATCAACAAAATACTCCAATTGGCGATGGCCCTGTCCTTTTACCAGACAACCATTACCTGTCCTACCAATCTAAGCTTTCGAAAGATCCCAACGAAAAGAGAGATCACATGGTCCTTCTTGAGTTTGTAACAGCTGCTGGGATTACACATGGCATGGATGAACTATACAAATAAGGTCCGGCTTATCGGTCAGTTTCACCTGATTTACGTAAAAACCCGCTTCGGCGGGTTTTTGCTTTTGGAGGGGCAGAAAGATGAATGACTGTCCACGACGCTATACCCAAAAGAAAAAAAAAAAACCCCGCCCCTGACAGGGCGGGGTTTTTTTTGCTTA",
    ),
    "pLIB432_B0034m": (
        "GCACTGAAGGTCCTCAATCGCACTGGAAACATCAAGGTCGACGAGGAGTTTACGGCTAGCTCAGTCCTAGGTATAGTGCTAGCTACTAGCGCTGTCTGTACTTGTATCAGTACACTGACGAGTCCCTAAAGGACGAAACACCGCCTCTACAAATAATTTTGTTTAACCATAGAGAAAGAGGAGAAATACTAATGGTCAGTAAAGGCGAAGCAGTTATCAAAGAGTTCATGCGCTTCAAAGTTCATATGGAAGGGTCGATGAACGGGCACGAATTTGAAATTGAAGGCGAAGGCGAAGGCCGCCCATATGAAGGGACCCAAACCGCAAAGCTTAAGGTTACTAAAGGCGGTCCATTACCCTTTTCGTGGGACATTTTAAGCCCACAGTTTATGTACGGGAGTCGCGCTTTCATCAAGCACCCTGCGGACATCCCAGATTACTACAAACAGTCTTTCCCCGAGGGGTTCAAGTGGGAGCGCGTGATGAACTTCGAGGATGGCGGAGCCGTGACGGTCACCCAAGATACCTCTTTGGAGGACGGTACGTTGATCTACAAAGTGAAATTGCGTGGCACGAATTTTCCACCTGATGGGCCTGTCATGCAGAAAAAGACAATGGGATGGGAAGCTTCCACGGAGCGCCTTTACCCAGAGGACGGTGTTCTTAAAGGGGATATCAAAATGGCGCTGCGTCTTAAAGATGGAGGCCGCTACCTGGCGGACTTCAAGACTACTTACAAGGCCAAAAAACCAGTGCAGATGCCGGGTGCGTACAATGTAGATCGTAAATTAGATATTACAAGTCACAATGAAGATTACACGGTCGTAGAGCAGTATGAGCGCAGTGAGGGGCGTCACTCTACGGGCGGTATGGACGAGTTATACAAGTAAAGGTGGAAACACAGAAAAAAGCCCGCACCTGACAGTGCGGGCTTTTTTTTTCGACCAAAGGCTCGGTACCAAATTCCAGAAAAGACACCCGAAAGGGTGTTTTTTCGTTTTGGTCCGCTTATGAAGGAGTTGACGGCTAGCTCAGTCCTAGGTACAGTGCTAGCTACTAGCGCTGTCTGTACTTGTATCAGTACACTGACGAGTCCCTAAAGGACGAAACACCGCCTCTACAAATAATTTTGTTTAA",
        "CCATAGAGAAAGAGGAGAAATACT",
        "AATGAGTAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGAATTAGATGGTGATGTTAATGGGCACAAATTTTCTGTCAGTGGAGAGGGTGAAGGTGATGCAACATACGGAAAACTTACCCTTAAATTGATTTGCACTACTGGAAAACTACCTGTTCCATGGCCAACACTTGTCACTACTTTGGGTTATGGTGTTCAATGCTTTGCGAGATACCCAGATCATATGAAACAGCATGACTTTTTCAAGAGTGCCATGCCCGAAGGTTATGTACAGGAAAGAACTATATTTTTCAAAGATGACGGGAACTACAAGACACGTGCTGAAGTCAAGTTTGAAGGTGATACCCTTGTTAATAGAATCGAGTTAAAAGGTATTGATTTTAAAGAAGATGGAAACATTCTTGGACACAAATTGGAATACAACTATAACTCACACAATGTATACATCACGGCAGACAAACAAAAGAATGGAATCAAAGCGAACTTCAAAATTAGACACAACATTGAAGATGGAGGTGTTCAACTAGCAGACCATTATCAACAAAATACTCCAATTGGCGATGGCCCTGTCCTTTTACCAGACAACCATTACCTGTCCTACCAATCTAAGCTTTCGAAAGATCCCAACGAAAAGAGAGATCACATGGTCCTTCTTGAGTTTGTAACAGCTGCTGGGATTACACATGGCATGGATGAACTATACAAATAAGGTCCGGCTTATCGGTCAGTTTCACCTGATTTACGTAAAAACCCGCTTCGGCGGGTTTTTGCTTTTGGAGGGGCAGAAAGATGAATGACTGTCCACGACGCTATACCCAAAAGAAAAAAAAAAAACCCCGCCCCTGACAGGGCGGGGTTTTTTTTGCTTA",
    ),
    "pLIB433": (
        "GCACTGAAGGTCCTCAATCGCACTGGAAACATCAAGGTCGACGAGGAGTTTACGGCTAGCTCAGTCCTAGGTATAGTGCTAGCTACTAGCGCTGTCTGTACTTGTATCAGTACACTGACGAGTCCCTAAAGGACGAAACACCGCCTCTACAAATAATTTTGTTTAACCATAGAGAAAGAGGAGAAATACTAATGGTCAGTAAAGGCGAAGCAGTTATCAAAGAGTTCATGCGCTTCAAAGTTCATATGGAAGGGTCGATGAACGGGCACGAATTTGAAATTGAAGGCGAAGGCGAAGGCCGCCCATATGAAGGGACCCAAACCGCAAAGCTTAAGGTTACTAAAGGCGGTCCATTACCCTTTTCGTGGGACATTTTAAGCCCACAGTTTATGTACGGGAGTCGCGCTTTCATCAAGCACCCTGCGGACATCCCAGATTACTACAAACAGTCTTTCCCCGAGGGGTTCAAGTGGGAGCGCGTGATGAACTTCGAGGATGGCGGAGCCGTGACGGTCACCCAAGATACCTCTTTGGAGGACGGTACGTTGATCTACAAAGTGAAATTGCGTGGCACGAATTTTCCACCTGATGGGCCTGTCATGCAGAAAAAGACAATGGGATGGGAAGCTTCCACGGAGCGCCTTTACCCAGAGGACGGTGTTCTTAAAGGGGATATCAAAATGGCGCTGCGTCTTAAAGATGGAGGCCGCTACCTGGCGGACTTCAAGACTACTTACAAGGCCAAAAAACCAGTGCAGATGCCGGGTGCGTACAATGTAGATCGTAAATTAGATATTACAAGTCACAATGAAGATTACACGGTCGTAGAGCAGTATGAGCGCAGTGAGGGGCGTCACTCTACGGGCGGTATGGACGAGTTATACAAGTAAAGGTGGAAACACAGAAAAAAGCCCGCACCTGACAGTGCGGGCTTTTTTTTTCGACCAAAGGCTCGGTACCAAATTCCAGAAAAGACACCCGAAAGGGTGTTTTTTCGTTTTGGTCCGCTTATGAAGGAGTTGACGGCTAGCTCAGTCCTAGGTACAGTGCTAGCTACTAGCTGTCACCGGATGTGCTTTCCGGTCTGATGAGTCCGTGAGGACGAAACAGCCTCTACAAATAATTTTGTTTAAGGGCCCAAGTTCACTTAAAAAGGAGATCAACAATGAAAGCAATTTTCGTACTGAAACATCTTAATCATGC",
        "NNNNNNNNN",
        "TTAAAAATGGCACGTACCCCGAGCCGTAGCAGCATTGGTAGCCTGCGTAGTCCGCATACCCATAAAGCAATTCTGACCAGCACCATTGAAATCCTGAAAGAATGTGGTTATAGCGGTCTGAGCATTGAAAGCGTGGCACGTCGCGCCGGTGCAGGCAAACCGACCATTTATCGTTGGTGGACCAACAAAGCAGCACTGATTGCCGAAGTGTATGAAAATGAAATCGAACAGGTACGTAAATTTCCGGATTTGGGTAGCTTTAAAGCCGATCTGGATTTTCTGCTGCATAATCTGTGGAAAGTTTGGCGTGAAACCATTTGTGGTGAAGCATTTCGTTGTGTTATTGCAGAAGCACAGTTGGACCCTGTAACCCTGACCCAACTGAAAGATCAGTTTATGGAACGTCGTCGTGAGATACCGAAAAAACTGGTTGAAGATGCCATTAGCAATGGTGAACTGCCGAAAGATATCAATCGTGAACTGCTGCTGGATATGATTTTTGGTTTTTGTTGGTATCGCCTGCTGACCGAACAGTTGACCGTTGAACAGGATATTGAAGAATTTACCTTCCTGCTGATTAATGGTGTTTGTCCGGGTACACAGTGTTAATAAGGTCCGGCAATTAAAAAAGCGGCTAACCACGCCGCTTTTTTTACGTCTGCACTCGGTACCAAATTCCAGAAAAGAGGCCTCCCGAAAGGGGGGCCTTTTTTCGTTTTGGTCCGCTTAAATAGGAGCGACGTACGGTGGAATCTGATTCGTTACCAATTGACATGATACGAAACGTACCGTATCGTTAAGGTTACTAGCGCTGTCTGTACTTGTATCAGTACACTGACGAGTCCCTAAAGGACGAAACACCGCCTCTACAAATAATTTTGTTTAACCATAGAGTCACACAGGAAAGTACTAATGAGTAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGAATTAGATGGTGATGTTAATGGGCACAAATTTTCTGTCAGTGGAGAGGGTGAAGGTGATGCAACATACGGAAAACTTACCCTTAAATTGATTTGCACTACTGGAAAACTACCTGTTCCATGGCCAACACTTGTCACTACTTTGGGTTATGGTGTTCAATGCTTTGCGAGATACCCAGATCATATGAAACAGCATGACTTTTTCAAGAGTGCCATGCCCGAAGGTTATGTACAGGAAAGAACTATATTTTTCAAAGATGACGGGAACTACAAGACACGTGCTGAAGTCAAGTTTGAAGGTGATACCCTTGTTAATAGAATCGAGTTAAAAGGTATTGATTTTAAAGAAGATGGAAACATTCTTGGACACAAATTGGAATACAACTATAACTCACACAATGTATACATCACGGCAGACAAACAAAAGAATGGAATCAAAGCGAACTTCAAAATTAGACACAACATTGAAGATGGAGGTGTTCAACTAGCAGACCATTATCAACAAAATACTCCAATTGGCGATGGCCCTGTCCTTTTACCAGACAACCATTACCTGTCCTACCAATCTAAGCTTTCGAAAGATCCCAACGAAAAGAGAGATCACATGGTCCTTCTTGAGTTTGTAACAGCTGCTGGGATTACACATGGCATGGATGAACTATACAAATAAGGTCCGGCTTATCGGTCAGTTTCACCTGATTTACGTAAAAACCCGCTTCGGCGGGTTTTTGCTTTTGGAGGGGCAGAAAGATGAATGACTGTCCACGACGCTATACCCAAAAGAAAAAAAAAAAACCCCGCCCCTGACAGGGCGGGGTTTTTTTTGCTTA",
    ),
    "pLIB434": (
        "GCACTGAAGGTCCTCAATCGCACTGGAAACATCAAGGTCGACGAGGAGTTTACGGCTAGCTCAGTCCTAGGTATAGTGCTAGCTACTAGCGCTGTCTGTACTTGTATCAGTACACTGACGAGTCCCTAAAGGACGAAACACCGCCTCTACAAATAATTTTGTTTAACCATAGAGAAAGAGGAGAAATACTAATGGTCAGTAAAGGCGAAGCAGTTATCAAAGAGTTCATGCGCTTCAAAGTTCATATGGAAGGGTCGATGAACGGGCACGAATTTGAAATTGAAGGCGAAGGCGAAGGCCGCCCATATGAAGGGACCCAAACCGCAAAGCTTAAGGTTACTAAAGGCGGTCCATTACCCTTTTCGTGGGACATTTTAAGCCCACAGTTTATGTACGGGAGTCGCGCTTTCATCAAGCACCCTGCGGACATCCCAGATTACTACAAACAGTCTTTCCCCGAGGGGTTCAAGTGGGAGCGCGTGATGAACTTCGAGGATGGCGGAGCCGTGACGGTCACCCAAGATACCTCTTTGGAGGACGGTACGTTGATCTACAAAGTGAAATTGCGTGGCACGAATTTTCCACCTGATGGGCCTGTCATGCAGAAAAAGACAATGGGATGGGAAGCTTCCACGGAGCGCCTTTACCCAGAGGACGGTGTTCTTAAAGGGGATATCAAAATGGCGCTGCGTCTTAAAGATGGAGGCCGCTACCTGGCGGACTTCAAGACTACTTACAAGGCCAAAAAACCAGTGCAGATGCCGGGTGCGTACAATGTAGATCGTAAATTAGATATTACAAGTCACAATGAAGATTACACGGTCGTAGAGCAGTATGAGCGCAGTGAGGGGCGTCACTCTACGGGCGGTATGGACGAGTTATACAAGTAAAGGTGGAAACACAGAAAAAAGCCCGCACCTGACAGTGCGGGCTTTTTTTTTCGACCAAAGGCTCGGTACCAAATTCCAGAAAAGACACCCGAAAGGGTGTTTTTTCGTTTTGGTCCGCTTATGAAGGAGTTGACGGCTAGCTCAGTCCTAGGTACAGTGCTAGCTACTAGCTGTCACCGGATGTGCTTTCCGGTCTGATGAGTCCGTGAGGACGAAACAGCCTCTACAAATAATTTTGTTTAAGGGCCCAAGTTCACTTAAAAAGGAGATCAACAATGAAAGCAATTTTCGTACTGAAACATCTTAATCATGC",
        "NNNNNNNNN",
        "TTAAAAATGAAACCAGTAACGTTATACGATGTCGCAGAGTATGCCGGTGTCTCTTATATGACCGTTTCCCGCGTGGTGAACCAGGCCAGCCACGTTTCTGCGAAAACGCGGGAAAAAGTGGAAGCGGCGATGGTGGAGCTGAATTACATTCCCAACCGCGTGGCACAACAACTGGCGGGCAAACAGTCGTTGCTGATTGGCGTTGCCACCTCCAGTCTGGCCCTGCACGCGCCGTCGCAAATTGTCGCGGCGATTAAATCTCGCGCCGATCAACTGGGTGCCAGCGTGGTGGTGTCGATGGTAGAACGAAGCGGCGTCGAAGCCTGTAAAGCGGCGGTGCACAATCTTCTCGCGCAACGCGTCAGTGGGCTGATCATTAACTATCCGCTGGATGACCAGGATGCCATTGCTGTGGAAGCTGCCTGCACTAATGTTCCGGCGTTATTTCTTGATGTCTCTGACCAGACACCCATCAACAGTATTATTTACTCCCATGAGGACGGTACGCGACTGGGCGTGGAGCATCTGGTCGCATTGGGTCACCAGCAAATCGCGCTGTTAGCGGGCCCATTAAGTTCTGTCTCGGCGCGTCTGCGTCTGGCTGGCTGGCATAAATATCTCACTCGCAATCAAATTCAGCCGATAGCGGAACGGGAAGGCGACTGGAGTGCCATGTCCGGTTTTCAACAAACCATGCAAATGCTGAATGAGGGCATCGTTCCCACTGCGATGCTGGTTGCCAACGATCAGATGGCGCTGGGCGCAATGCGCGCCATTACCGAGTCCGGGCTGCGCGTTGGTGCGGATATCTCGGTAGTGGGATACGACGATACCGAAGATAGCTCATGTTATATCCCGCCGTTAACCACCATCAAACAGGATTTTCGCCTGCTGGGGCAAACCAGCGTGGACCGCTTGCTGCAACTCTCTCAGGGCCAGGCGGTGAAGGGCAATCAGCTGTTGCCAGTCTCACTGGTGAAAAGAAAAACCACCCTGGCGCCCAATACGCAAACCGCCTCTCCCCGCGCGTTGGCCGATTCATTAATGCAGCTGGCACGACAGGTTTCCCGACTGGAAAGCGGGCAGTAATAAGGTCCGGCAATTAAAAAAGCGGCTAACCACGCCGCTTTTTTTACGTCTGCACTCGGTACCAAATTCCAGAAAAGAGGCCTCCCGAAAGGGGGGCCTTTTTTCGTTTTGGTCCGCTTAAATAGGAGTGTTGACAATTAATCATCGGCTCGTATAATGTGTGGAATTGTGAGCGCTCACAATTTACTAGCGCTGTCTGTACTTGTATCAGTACACTGACGAGTCCCTAAAGGACGAAACACCGCCTCTACAAATAATTTTGTTTAACCATAGAGTCACACAGGAAAGTACTAATGAGTAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGAATTAGATGGTGATGTTAATGGGCACAAATTTTCTGTCAGTGGAGAGGGTGAAGGTGATGCAACATACGGAAAACTTACCCTTAAATTGATTTGCACTACTGGAAAACTACCTGTTCCATGGCCAACACTTGTCACTACTTTGGGTTATGGTGTTCAATGCTTTGCGAGATACCCAGATCATATGAAACAGCATGACTTTTTCAAGAGTGCCATGCCCGAAGGTTATGTACAGGAAAGAACTATATTTTTCAAAGATGACGGGAACTACAAGACACGTGCTGAAGTCAAGTTTGAAGGTGATACCCTTGTTAATAGAATCGAGTTAAAAGGTATTGATTTTAAAGAAGATGGAAACATTCTTGGACACAAATTGGAATACAACTATAACTCACACAATGTATACATCACGGCAGACAAACAAAAGAATGGAATCAAAGCGAACTTCAAAATTAGACACAACATTGAAGATGGAGGTGTTCAACTAGCAGACCATTATCAACAAAATACTCCAATTGGCGATGGCCCTGTCCTTTTACCAGACAACCATTACCTGTCCTAC
CAATCTAAGCTTTCGAAAGATCCCAACGAAAAGAGAGATCACATGGTCCTTCTTGAGTTTGTAACAGCTGCTGGGATTACACATGGCATGGATGAACTATACAAATAAGGTCCGGCTTATCGGTCAGTTTCACCTGATTTACGTAAAAACCCGCTTCGGCGGGTTTTTGCTTTTGGAGGGGCAGAAAGATGAATGACTGTCCACGACGCTATACCCAAAAGAAAAAAAAAAAACCCCGCCCCTGACAGGGCGGGGTTTTTTTTGCTTA",
    ),
    "pLIB435": (
        "GCACTGAAGGTCCTCAATCGCACTGGAAACATCAAGGTCGACGAGGAGTTTACGGCTAGCTCAGTCCTAGGTATAGTGCTAGCTACTAGCGCTGTCTGTACTTGTATCAGTACACTGACGAGTCCCTAAAGGACGAAACACCGCCTCTACAAATAATTTTGTTTAACCATAGAGAAAGAGGAGAAATACTAATGGTCAGTAAAGGCGAAGCAGTTATCAAAGAGTTCATGCGCTTCAAAGTTCATATGGAAGGGTCGATGAACGGGCACGAATTTGAAATTGAAGGCGAAGGCGAAGGCCGCCCATATGAAGGGACCCAAACCGCAAAGCTTAAGGTTACTAAAGGCGGTCCATTACCCTTTTCGTGGGACATTTTAAGCCCACAGTTTATGTACGGGAGTCGCGCTTTCATCAAGCACCCTGCGGACATCCCAGATTACTACAAACAGTCTTTCCCCGAGGGGTTCAAGTGGGAGCGCGTGATGAACTTCGAGGATGGCGGAGCCGTGACGGTCACCCAAGATACCTCTTTGGAGGACGGTACGTTGATCTACAAAGTGAAATTGCGTGGCACGAATTTTCCACCTGATGGGCCTGTCATGCAGAAAAAGACAATGGGATGGGAAGCTTCCACGGAGCGCCTTTACCCAGAGGACGGTGTTCTTAAAGGGGATATCAAAATGGCGCTGCGTCTTAAAGATGGAGGCCGCTACCTGGCGGACTTCAAGACTACTTACAAGGCCAAAAAACCAGTGCAGATGCCGGGTGCGTACAATGTAGATCGTAAATTAGATATTACAAGTCACAATGAAGATTACACGGTCGTAGAGCAGTATGAGCGCAGTGAGGGGCGTCACTCTACGGGCGGTATGGACGAGTTATACAAGTAAAGGTGGAAACACAGAAAAAAGCCCGCACCTGACAGTGCGGGCTTTTTTTTTCGACCAAAGGCTCGGTACCAAATTCCAGAAAAGACACCCGAAAGGGTGTTTTTTCGTTTTGGTCCGCTTATGAAGGAGTTGACGGCTAGCTCAGTCCTAGGTACAGTGCTAGCTACTAGCTGTCACCGGATGTGCTTTCCGGTCTGATGAGTCCGTGAGGACGAAACAGCCTCTACAAATAATTTTGTTTAAGGGCCCAAGTTCACTTAAAAAGGAGATCAACAATGAAAGCAATTTTCGTACTGAAACATCTTAATCATGC",
        "NNNNNNNNN",
        "TTAAAAATGCCGAAACTGGGTATGCAGAGCATTCGTCGTCGTCAGCTGATTGATGCAACCCTGGAAGCAATTAATGAAGTTGGTATGCATGATGCAACCATTGCACAGATTGCACGTCGTGCCGGTGTTAGCACCGGTATTATTAGCCATTATTTCCGCGATAAAAACGGTCTACTGGAAGCAACCATGCGTGATATTACCAGCCAGCTGCGTGATGCAGTTCTGAATCGTCTGCATGCACTGCCGCAGGGTAGCGCAGAACAGCGTCTGCAGGCAATTGTTGGTGGTAATTTTGATGAAACCCAGGTTAGCAGCGCAGCAATGAAAGCATGGCTGGCATTTTGGGCAATCAGCATGCATCAGCCGATGCTGTATCGTCTGCAGCAGGTTAGCAGTCGTCGTCTGCTGAGCAATCTGGTTAGCGAATTTCGTCGTGAACTGCCTCGTGAACAGGCACAAGAGGCAGGTTATGGTCTGGCAGCACTGATTGATGGTCTGTGGCTGCGTGCAGCACTGAGCGGTAAACCGCTGGATAAAACCCGTGCAAATAGCCTGACCCGTCATTTTATCACCCAGCATCTGCCGACCGATTAATAAGGTCCGGCAATTAAAAAAGCGGCTAACCACGCCGCTTTTTTTACGTCTGCACTCGGTACCAAATTCCAGAAAAGAGGCCTCCCGAAAGGGGGGCCTTTTTTCGTTTTGGTCCGCTTAAATAGGAGAGCGCGGGTGAGAGGGATTCGTTACCAATAGACAATTGATTGGACGTTCAATATAATGCTAGCTACTAGCGCTGTCTGTACTTGTATCAGTACACTGACGAGTCCCTAAAGGACGAAACACCGCCTCTACAAATAATTTTGTTTAACCATAGAGTCACACAGGAAAGTACTAATGAGTAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGAATTAGATGGTGATGTTAATGGGCACAAATTTTCTGTCAGTGGAGAGGGTGAAGGTGATGCAACATACGGAAAACTTACCCTTAAATTGATTTGCACTACTGGAAAACTACCTGTTCCATGGCCAACACTTGTCACTACTTTGGGTTATGGTGTTCAATGCTTTGCGAGATACCCAGATCATATGAAACAGCATGACTTTTTCAAGAGTGCCATGCCCGAAGGTTATGTACAGGAAAGAACTATATTTTTCAAAGATGACGGGAACTACAAGACACGTGCTGAAGTCAAGTTTGAAGGTGATACCCTTGTTAATAGAATCGAGTTAAAAGGTATTGATTTTAAAGAAGATGGAAACATTCTTGGACACAAATTGGAATACAACTATAACTCACACAATGTATACATCACGGCAGACAAACAAAAGAATGGAATCAAAGCGAACTTCAAAATTAGACACAACATTGAAGATGGAGGTGTTCAACTAGCAGACCATTATCAACAAAATACTCCAATTGGCGATGGCCCTGTCCTTTTACCAGACAACCATTACCTGTCCTACCAATCTAAGCTTTCGAAAGATCCCAACGAAAAGAGAGATCACATGGTCCTTCTTGAGTTTGTAACAGCTGCTGGGATTACACATGGCATGGATGAACTATACAAATAAGGTCCGGCTTATCGGTCAGTTTCACCTGATTTACGTAAAAACCCGCTTCGGCGGGTTTTTGCTTTTGGAGGGGCAGAAAGATGAATGACTGTCCACGACGCTATACCCAAAAGAAAAAAAAAAAACCCCGCCCCTGACAGGGCGGGGTTTTTTTTGCTTA",
    ),
}

In [None]:
# Keyword arguments passed to align.pairwise_align for every read/reference pair.
# NOTE(review): "sw" presumably selects Smith-Waterman local alignment and
# "degenerate" IUPAC-aware matching -- confirm against paulssonlab.sequencing.align.
parasail_kwargs = dict(
    parasail_algorithm="sw",
    degenerate=True,
    match=1,
    mismatch=-1,
    gap_opening=3,
    gap_extension=1,
)

In [None]:
# Load one shard (0 of 200) of the consensus output as a polars DataFrame.
# The commented-out path below is the equivalent shard from the 231130 MinION run.
# ipc_filename = "/home/jqs1/scratch/sequencing/231130_bcd_rbses_run3_minion/20231130_1904_MN35044_FAX40126_ee95ee31/consensus/consensus-0-of-20.arrow"
ipc_filename = "/home/jqs1/scratch/sequencing/231201_bcd_rbses_run3/20231201_1101_1F_PAU05823_773c75ee/consensus/consensus-0-of-200.arrow"
df = pl.read_ipc(ipc_filename)

In [None]:
# Preview the consensus table read from the IPC file.
df

In [None]:
%%time
df_idx = df
freqs = Counter()
idxs = defaultdict(list)
cigars = defaultdict(list)
num = len(df_idx)
num = 100
for idx in trange(num):
    seq = df_idx[idx, "consensus_seq"]
    # path = df_idx[idx, "variants_path"]
    alignments = {}
    for name, segments in refs.items():
        ref_seq = reverse_complement("".join(segments))
        score, cigar = align.pairwise_align(seq, ref_seq, **parasail_kwargs)
        score -= len(ref_seq)
        alignments[name] = (score, cigar)
    sorted_alignments = sorted(alignments.items(), key=lambda x: x[1][0], reverse=True)
    best_name = sorted_alignments[0][0]
    best_cigar = sorted_alignments[0][1]
    freqs[best_name] += 1
    idxs[best_name].append(idx)
    cigars[best_name].append(best_cigar)

In [None]:
# Number of reads assigned to each reference.
freqs

## GraphAligner vs. Pairwise

In [None]:
# Reload the consensus shard (same path as the earlier consensus load) so this
# section can be run on its own, then show how many rows it contains.
ipc_filename = "/home/jqs1/scratch/sequencing/231201_bcd_rbses_run3/20231201_1101_1F_PAU05823_773c75ee/consensus/consensus-0-of-200.arrow"
df = pl.read_ipc(ipc_filename)
len(df)

In [None]:
%%time
num = len(df)
# num = 100
pairwise = {}
for idx in trange(num):
    name = df[idx, "name"]
    seq = df[idx, "consensus_seq"]
    alignments = {}
    for ref_name, segments in refs.items():
        ref_seq = reverse_complement("".join(segments))
        score, cigar = align.pairwise_align(seq, ref_seq, **parasail_kwargs)
        score -= len(ref_seq)
        alignments[ref_name] = (score, cigar)
    sorted_alignments = sorted(alignments.items(), key=lambda x: x[1][0], reverse=True)
    # best_ref_name = sorted_alignments[0][0]
    # best_cigar = sorted_alignments[0][1]
    pairwise[name] = sorted_alignments

In [None]:
# Graph segment names that identify which construct / RBS variant a read carries.
variant_segments = {
    "pLIB435:BetI_pBetI",
    "pLIB434:LacI_pTac",
    "pLIB433:PhlF_pPhlF",
    "pLIB431-432:RBS=B0032m",
    "pLIB431-432:RBS=B0033m",
    "pLIB431-432:RBS=B0034m",
    "pLIB431-432:RBS=StrongRBSLib",
    # "BCD_RBS:RBS",
}

In [None]:
# Load the vg2 Arrow file; this rebinds both `ipc_filename` and `gaf_df`,
# replacing the table built from the .gaf file earlier in the notebook.
ipc_filename = "/home/jqs1/scratch/sequencing/231201_bcd_rbses_run3/20231201_1101_1F_PAU05823_773c75ee/_temp/vg2.arrow"
gaf_df = pl.read_ipc(ipc_filename)

In [None]:
# Add three boolean flags per row:
#   dup    -- the read name occurs more than once in the table
#   e2e    -- the path holds exactly two distinct entries from `construct_ends`
#   bc_e2e -- the path holds exactly two distinct entries from `barcode_ends`
# NOTE(review): "e2e" presumably means the alignment spans the construct
# end-to-end -- confirm.
construct_ends = [
    "<BC:UPSTREAM",
    "<pLIB430-435:upstream",
    ">BC:UPSTREAM",
    ">pLIB430-435:upstream",
]
barcode_ends = [
    "<BC:UPSTREAM",
    "<BC:SPACER2",
    ">BC:UPSTREAM",
    ">BC:SPACER2",
]
path_col = pl.col("variants_path")
gaf_df2 = gaf_df.with_columns(
    dup=pl.col("name").is_duplicated(),
    e2e=path_col.list.set_intersection(construct_ends).list.len() == 2,
    bc_e2e=path_col.list.set_intersection(barcode_ends).list.len() == 2,
)

In [None]:
# Number of rows with the bc_e2e flag set.
gaf_df2["bc_e2e"].sum()

In [None]:
# Number of rows with the e2e flag set.
gaf_df2["e2e"].sum()

In [None]:
# Number of rows whose read name is duplicated.
gaf_df2["dup"].sum()

In [None]:
# Query-length histograms for non-duplicated reads, one panel per bc_e2e value.
(
    gaf_df2.filter(~pl.col("dup"))
    .select(["query_length", "dup", "e2e", "bc_e2e"])
    .to_pandas()
    .hvplot.hist("query_length", bins=100, by="bc_e2e", subplots=True, width=400)
)

In [None]:
# Query-length histograms for non-duplicated reads, one panel per e2e value.
(
    gaf_df2.filter(~pl.col("dup"))
    .select(["query_length", "dup", "e2e", "bc_e2e"])
    .to_pandas()
    .hvplot.hist("query_length", bins=100, by="e2e", subplots=True, width=400)
)

In [None]:
# Query-length histograms for duplicated reads only, one panel per bc_e2e value.
(
    gaf_df2.filter(pl.col("dup"))
    .select(["query_length", "dup", "e2e", "bc_e2e"])
    .to_pandas()
    .hvplot.hist("query_length", bins=100, by="bc_e2e", subplots=True, width=400)
)

In [None]:
# Query-length histograms for duplicated reads only, one panel per e2e value.
(
    gaf_df2.filter(pl.col("dup"))
    .select(["query_length", "dup", "e2e", "bc_e2e"])
    .to_pandas()
    .hvplot.hist("query_length", bins=100, by="e2e", subplots=True, width=400)
)

In [None]:
# Query-length histograms for every row (no filtering), one panel per e2e value.
(
    gaf_df2.select(["query_length", "dup", "e2e", "bc_e2e"])
    .to_pandas()
    .hvplot.hist("query_length", bins=100, by="e2e", subplots=True, width=400)
)

In [None]:
# Query-length histograms restricted to rows with depth > 5, one panel per e2e value.
(
    gaf_df2.filter(pl.col("depth") > 5)
    .select(["query_length", "dup", "e2e", "bc_e2e"])
    .to_pandas()
    .hvplot.hist("query_length", bins=100, by="e2e", subplots=True, width=400)
)

In [None]:
%%time
gaf_df_ = gaf_df2.filter(pl.col("e2e"), pl.col("duplex_depth") > 0, pl.col("NM") < 50)
num = len(gaf_df_)
# num = 20
graphaligner = {}
for idx in trange(num):
    name = gaf_df_["name"][idx]
    path = [s[1:] for s in gaf_df_["variants_path"][idx]]
    variant = set(path) & variant_segments
    if not variant and "BCD_RBS:RBS" in path:
        variant = ["BCD_RBS:RBS"]
    if not variant:
        variant = None
    else:
        variant = variant.pop()
    graphaligner[name] = (variant, path)

In [None]:
# Number of reads with a graph-path-based call.
len(graphaligner)

In [None]:
# Number of reads with a pairwise-alignment ranking.
len(pairwise)

In [None]:
# Number of read names present in both result dictionaries.
len(graphaligner.keys() & pairwise.keys())

In [None]:
# Cross-tabulate the two methods over reads called by both:
# key = (top pairwise reference, graph-path variant).
# `freqs` counts reads per key; `names` lists the read names per key.
freqs = Counter()
names = defaultdict(list)
shared_names = graphaligner.keys() & pairwise.keys()
for name in shared_names:
    pairwise_call = pairwise[name][0][0]
    graph_call = graphaligner[name][0]
    key = (pairwise_call, graph_call)
    names[key].append(name)
    freqs[key] += 1

In [None]:
# Read counts per (pairwise call, graph-path call) pair.
freqs

In [None]:
# Observation: GraphAligner (GA) is calling pLIB432_B0032m reads as BCD_RBS:RBS

In [None]:
# Pick one read (the fourth in the list) on which both methods agree on pLIB433
# and show its consensus sequence.
n = names[("pLIB433", "pLIB433:PhlF_pPhlF")][3]
seq = gaf_df2.filter(pl.col("name") == n)
seq["consensus_seq"][0]

In [None]:
# (pairwise call, graph-path call) pairs to inspect, all with pLIB432_B0032m
# as the pairwise call.
off_diag = [
    ("pLIB432_B0032m", graph_call)
    for graph_call in (
        "BCD_RBS:RBS",
        "pLIB435:BetI_pBetI",
        "pLIB433:PhlF_pPhlF",
        "pLIB434:LacI_pTac",
    )
]

In [None]:
# Read names for the (pLIB432_B0032m, pLIB433:PhlF_pPhlF) disagreement.
names[off_diag[2]]

In [None]:
# Read names for the (pLIB432_B0032m, BCD_RBS:RBS) disagreement.
names[off_diag[0]]

In [None]:
# Distribution of the NM column over e2e rows.
(
    gaf_df2.filter(pl.col("e2e"))
    .get_column("NM")
    .to_pandas()
    .hvplot.hist(bins=100)
)

In [None]:
# Distribution of the id column over e2e rows.
(
    gaf_df2.filter(pl.col("e2e"))
    .get_column("id")
    .to_pandas()
    .hvplot.hist(bins=100)
)

In [None]:
# Row(s) for the first read in the (pLIB432_B0032m, pLIB433:PhlF_pPhlF) group.
seq = gaf_df2.filter(pl.col("name") == names[off_diag[2]][0])
seq

In [None]:
# Value of the "cg" column for the first selected row.
seq["cg"][0]

In [None]:
# Consensus sequence of the first selected row.
seq["consensus_seq"][0]

## Frequency distribution

In [None]:
# Graph segment names that identify which construct / RBS variant a read carries.
variant_segments = {
    "pLIB435:BetI_pBetI",
    "pLIB434:LacI_pTac",
    "pLIB433:PhlF_pPhlF",
    "pLIB431-432:RBS=B0032m",
    "pLIB431-432:RBS=B0033m",
    "pLIB431-432:RBS=B0034m",
    "pLIB431-432:RBS=StrongRBSLib",
    # "BCD_RBS:RBS",
}
# Graph segment names that identify the promoter variant.
promoter_segments = {
    "pLIB430-435:promoter=J23100",
    "pLIB430-435:promoter=J23103",
    "pLIB430-435:promoter=J23106",
    "pLIB430-435:promoter=J23116",
}

In [None]:
# Scratch cell intentionally left without code: it held an incomplete `set.`
# attribute lookup (a SyntaxError that stopped Restart & Run All).

In [None]:
%%time
gaf_df_ = gaf_df2.filter(
    pl.col("e2e")
)  # , pl.col("duplex_depth") > 0, pl.col("NM") < 50)
num = len(gaf_df_)
# num = 20
freqs = Counter()
for idx in trange(num):
    name = gaf_df_["name"][idx]
    path = set([s[1:] for s in gaf_df_["variants_path"][idx]])
    # variant = tuple(sorted(set(path) & variant_segments))
    rbs = path & variant_segments
    if rbs:
        rbs = rbs.pop()
    else:
        rbs = "BCD"
    prom = (path & promoter_segments).pop()
    variant = (rbs, prom)
    freqs[variant] += 1
    # if not variant and "BCD_RBS:RBS" in path:
    #     variant = ["BCD_RBS:RBS"]
    # if not variant:
    #     variant = None
    # else:
    #     variant = variant.pop()
    # graphaligner[name] = (variant, path)

In [None]:
# Tabulate the counts: first key element as rows, second key element as columns.
variant_index = pd.MultiIndex.from_tuples(freqs.keys())
pd.DataFrame(freqs.values(), index=variant_index).unstack(1)

# 231130_bcd_rbses_run3_minion

In [None]:
# Load every extract_segments Arrow shard of the 231130 MinION run (glob
# pattern) into one DataFrame; this rebinds `df`.
df = pl.read_ipc(
    "/home/jqs1/scratch/sequencing/231130_bcd_rbses_run3_minion/20231130_1904_MN35044_FAX40126_ee95ee31/extract_segments/*.arrow"
)

In [None]:
# Add three boolean flags per row:
#   dup    -- the read name occurs more than once in the table
#   e2e    -- the path holds exactly two distinct entries from `construct_ends`
#   bc_e2e -- the path holds exactly two distinct entries from `barcode_ends`
# NOTE(review): "e2e" presumably means the alignment spans the construct
# end-to-end -- confirm.
construct_ends = [
    "<BC:UPSTREAM",
    "<pLIB430-435:upstream",
    ">BC:UPSTREAM",
    ">pLIB430-435:upstream",
]
barcode_ends = [
    "<BC:UPSTREAM",
    "<BC:SPACER2",
    ">BC:UPSTREAM",
    ">BC:SPACER2",
]
path_col = pl.col("variants_path")
df2 = df.with_columns(
    dup=pl.col("name").is_duplicated(),
    e2e=path_col.list.set_intersection(construct_ends).list.len() == 2,
    bc_e2e=path_col.list.set_intersection(barcode_ends).list.len() == 2,
)

In [None]:
# Total number of rows loaded.
len(df)

In [None]:
def label_columns(cols, func=None):
    """Build a polars when/then chain that labels rows by their first non-null column.

    One ``when(pl.col(c).is_not_null()).then(pl.lit(label))`` branch is added
    per entry of ``cols``, in iteration order, so earlier columns take
    precedence. No ``otherwise`` branch is attached.

    Parameters
    ----------
    cols : iterable of str
        Column names to test for non-null values.
    func : callable, optional
        Maps a column name to the label literal to emit. When omitted, the
        column name itself is used as the label.

    Returns
    -------
    The chained polars expression, or ``None`` if ``cols`` is empty.
    """
    chain = None
    for column in cols:
        present = pl.col(column).is_not_null()
        # The first branch starts the chain; later ones extend it.
        branch = pl.when(present) if chain is None else chain.when(present)
        label = column if func is None else func(column)
        chain = branch.then(pl.lit(label))
    return chain

In [None]:
# Label each unique e2e read with an "RBS" category and count the
# (RBS, promoter variant) combinations.
# The label is the prefix (text before "|") of the first non-null column among
# the three construct sequence columns; otherwise it falls back to
# "pLIB431-432:RBS=" followed by the RBS variant column.
rbs_label = pl.coalesce(
    label_columns(
        [
            "pLIB433:PhlF_pPhlF|seq",
            "pLIB434:LacI_pTac|seq",
            "pLIB435:BetI_pBetI|seq",
        ],
        lambda x: x.split("|")[0],
    ),
    pl.concat_str(pl.lit("pLIB431-432:RBS="), pl.col("pLIB431-432:RBS|variant")),
).alias("RBS")
(
    df2.filter(pl.col("e2e"), ~pl.col("dup"))
    .with_columns(rbs_label)
    .select(["RBS", "pLIB430-435:promoter|variant"])
    # Pack both columns into one struct so each combination is counted as a unit.
    .select(pl.struct(pl.all()).alias("foo"))
    .get_column("foo")
    .value_counts()
)

In [None]:
# Two-column pandas frame (RBS label, promoter variant) for unique e2e reads.
# The RBS label is the prefix (text before "|") of the first non-null column
# among the three construct sequence columns; otherwise it falls back to
# "pLIB431-432:RBS=" followed by the RBS variant column.
rbs_label = pl.coalesce(
    label_columns(
        [
            "pLIB433:PhlF_pPhlF|seq",
            "pLIB434:LacI_pTac|seq",
            "pLIB435:BetI_pBetI|seq",
        ],
        lambda x: x.split("|")[0],
    ),
    pl.concat_str(pl.lit("pLIB431-432:RBS="), pl.col("pLIB431-432:RBS|variant")),
).alias("RBS")
df_variants = (
    df2.filter(pl.col("e2e"), ~pl.col("dup"))
    .with_columns(rbs_label)
    .select(["RBS", "pLIB430-435:promoter|variant"])
    .to_pandas()
)

In [None]:
# Count each (RBS, promoter variant) combination and pivot the second index
# level (promoter variant) into columns.
df_variants.value_counts().unstack(1)

# 230707_repressilators

In [None]:
# Load every extract_segments Arrow shard of the 230707 repressilator run (glob
# pattern) into one DataFrame; this rebinds `df`.
df = pl.read_ipc(
    "/home/jqs1/scratch/sequencing/230707_repressilators/20230707_2040_MN35044_FAS94231_25542e0d/extract_segments/*.arrow"
)

In [None]:
# Add three boolean flags per row:
#   dup    -- the read name occurs more than once in the table
#   e2e    -- the path holds exactly two distinct entries from `construct_ends`
#   bc_e2e -- the path holds exactly two distinct entries from `barcode_ends`
# NOTE(review): "e2e" presumably means the alignment spans the construct
# end-to-end -- confirm.
construct_ends = [
    "<BC:UPSTREAM",
    "<UNS3",
    ">BC:UPSTREAM",
    ">UNS3",
]
barcode_ends = [
    "<BC:UPSTREAM",
    "<BC:SPACER2",
    ">BC:UPSTREAM",
    ">BC:SPACER2",
]
path_col = pl.col("variants_path")
df2 = df.with_columns(
    dup=pl.col("name").is_duplicated(),
    e2e=path_col.list.set_intersection(construct_ends).list.len() == 2,
    bc_e2e=path_col.list.set_intersection(barcode_ends).list.len() == 2,
)

In [None]:
# The three RBS sequence columns for unique e2e reads.
(
    df2.filter(pl.col("e2e"), ~pl.col("dup"))
    .select(["RBS1:RBS|seq", "RBS2:RBS|seq", "RBS3:RBS|seq"])
)

In [None]:
# Occurrences of each (RBS1, RBS2, RBS3) sequence triple among unique e2e reads.
(
    df2.filter(pl.col("e2e"), ~pl.col("dup"))
    .select(["RBS1:RBS|seq", "RBS2:RBS|seq", "RBS3:RBS|seq"])
    # Pack the three columns into one struct so each triple is counted as a unit.
    .select(pl.struct(pl.all()).alias("variant"))
    .to_series()
    .value_counts()
)

In [None]:
# RBS1 sequences seen more than 20 times among unique e2e reads with depth > 5,
# most frequent first.
(
    df2.filter(pl.col("e2e"), ~pl.col("dup"), pl.col("depth") > 5)
    .get_column("RBS1:RBS|seq")
    .value_counts()
    .filter(pl.col("count") > 20)
    .sort("count", descending=True)
)

In [None]:
# Overlay, per RBS position, the sorted per-sequence counts (log y-axis) for
# unique e2e reads.
rbs_columns = ["RBS1:RBS|seq", "RBS2:RBS|seq", "RBS3:RBS|seq"]
reads_e2e_unique = df2.filter(pl.col("e2e"), ~pl.col("dup"))
hv.Overlay(
    [
        reads_e2e_unique.get_column(col)
        .value_counts()
        .sort("count", descending=True)
        .get_column("count")
        .to_pandas()
        .hvplot.step(logy=True, label=col)
        for col in rbs_columns
    ]
)

In [None]:
# Occurrences of each (RBS1, RBS2) sequence pair among unique e2e reads,
# most frequent first.
(
    df2.filter(pl.col("e2e"), ~pl.col("dup"))
    .select(["RBS1:RBS|seq", "RBS2:RBS|seq"])
    .select(pl.struct(pl.all()).alias("variant"))
    .to_series()
    .value_counts()
    .sort("count", descending=True)
)

In [None]:
# Occurrences of each (RBS2, RBS3) sequence pair among unique e2e reads,
# most frequent first.
(
    df2.filter(pl.col("e2e"), ~pl.col("dup"))
    .select(["RBS2:RBS|seq", "RBS3:RBS|seq"])
    .select(pl.struct(pl.all()))
    .to_series()
    .value_counts()
    .sort("count", descending=True)
)

In [None]:
# Occurrences of each (RBS1, RBS2, RBS3) sequence triple among unique e2e reads,
# most frequent first.
(
    df2.filter(pl.col("e2e"), ~pl.col("dup"))
    .select(["RBS1:RBS|seq", "RBS2:RBS|seq", "RBS3:RBS|seq"])
    .select(pl.struct(pl.all()))
    .to_series()
    .value_counts()
    .sort("count", descending=True)
)

In [None]:
# Step plot of the sorted per-triple counts for (RBS1, RBS2, RBS3) among
# unique e2e reads.
(
    df2.filter(pl.col("e2e"), ~pl.col("dup"))
    .select(["RBS1:RBS|seq", "RBS2:RBS|seq", "RBS3:RBS|seq"])
    .select(pl.struct(pl.all()))
    .to_series()
    .value_counts()
    .sort("count", descending=True)
    .get_column("count")
    .to_pandas()
    .hvplot.step()
)

In [None]:
# Overlay the sorted per-pair counts for every pair of RBS positions
# (unique e2e reads).
rbs_columns = ["RBS1:RBS|seq", "RBS2:RBS|seq", "RBS3:RBS|seq"]
reads_e2e_unique = df2.filter(pl.col("e2e"), ~pl.col("dup"))
hv.Overlay(
    [
        reads_e2e_unique.select(list(cols))
        .select(pl.struct(pl.all()))
        .to_series()
        .value_counts()
        .sort("count", descending=True)
        .get_column("count")
        .to_pandas()
        .hvplot.step(label=", ".join(cols))
        for cols in it.combinations(rbs_columns, 2)
    ]
)

In [None]:
# Compare how often the same sequence occurs at two RBS positions.
# Sequences seen more than 100 times in `col1` are joined (on sequence
# equality) to the counts in `col2`; the scatter title reports r^2.
col1 = "RBS1:RBS|seq"
# col1 = "RBS2:RBS|seq"
col2 = "RBS2:RBS|seq"
# col2 = "RBS3:RBS|seq"
df2_filtered = df2.filter(pl.col("e2e"), ~pl.col("dup"))
counts_col1 = df2_filtered[col1].value_counts().filter(pl.col("count") > 100)
counts_col2 = df2_filtered[col2].value_counts()
# After the join, "count" holds the col1 counts and "count_right" the col2 counts.
df_joined = counts_col1.join(counts_col2, left_on=col1, right_on=col2)
pearson_r = np.corrcoef(df_joined["count"], df_joined["count_right"])[0, 1]
r2 = pearson_r**2
(
    df_joined.select(["count", "count_right"])
    .to_pandas()
    .hvplot.scatter("count", "count_right", title=f"r^2: {r2:.2f}")
)