In [None]:
import itertools as it
import operator
import re
import subprocess
import tempfile
import time
from collections import Counter, defaultdict
from functools import partial
from pathlib import Path

import duckdb
import gfapy
import holoviews as hv
import ibis
import matplotlib.pyplot as plt
import numba
import numpy as np
import pandas as pd
import parasail
import polars as pl
import pyabpoa
import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.dataset as ds
import pyfastx
import pysam
import spoa
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from pyarrow import csv
from pywfa import WavefrontAligner
from tqdm.auto import tqdm, trange

In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the paulssonlab.* packages imported below) are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import paulssonlab.sequencing.align as align
import paulssonlab.sequencing.cigar as scigar
import paulssonlab.sequencing.consensus as con
import paulssonlab.sequencing.gfa as sgfa
import paulssonlab.sequencing.io as sio
import paulssonlab.sequencing.processing as processing
from paulssonlab.util.sequence import reverse_complement

In [None]:
# Use bokeh as the HoloViews plotting backend.
hv.extension("bokeh")

In [None]:
# Profiling helpers: load the pyinstrument and line_profiler IPython
# extensions and import both modules for direct use.
%load_ext pyinstrument
import line_profiler
import pyinstrument

%load_ext line_profiler

In [None]:
# Turn on polars' global string cache.
# NOTE(review): presumably needed so categorical columns read from separate
# Arrow files are compatible with each other; confirm.
pl.enable_string_cache()

# Config

In [None]:
# Path to the GFA file for pLIB419.
# NOTE(review): absolute, user-specific scratch path; consider deriving it
# from a configurable data directory.
gfa_filename = "/home/jqs1/scratch/jqs1/sequencing/230930_alignment_test/230707_repressilators/pLIB419.gfa"

In [None]:
# Parse the GFA file into a gfapy graph object.
gfa = gfapy.Gfa.from_file(gfa_filename)

# 230707_repressilators

In [None]:
%%time
filename = "/home/jqs1/scratch/jqs1/sequencing/230707_repressilators/20230707_2040_MN35044_FAS94231_25542e0d/extract_segments/*.arrow"
df = pl.read_ipc(filename)
len(df)

In [None]:
%%time
df2 = df.filter(pl.col("name").is_first_distinct())
len(df2)

In [None]:
%%time
df3 = df2.filter(pl.col("depth") > 20)
len(df3)

In [None]:
%%time
df4 = df3.filter(
    (pl.col("consensus_path").list.get(0) == "<UNS9")
    & (pl.col("consensus_path").list.get(-1) == "<UNS3")
)
len(df4)

In [None]:
counts = df4.get_column("RBS1|seq").value_counts().filter(pl.col("counts") > 10)

In [None]:
counts

In [None]:
df4.get_column("RBS3|seq").value_counts().filter(pl.col("counts") > 10)

# 230922_bcd_rbses

In [None]:
%%time
# filename = "/home/jqs1/scratch/jqs1/sequencing/230707_repressilators/20230707_2040_MN35044_FAS94231_25542e0d/extract_segments/*.arrow"
filename = "/home/jqs1/scratch/jqs1/sequencing/230922_bcd_rbses_constitutive/20230922_1104_1A_PAQ83451_8d610a8c/extract_segments/*.arrow"
df = pl.read_ipc(filename)
len(df)

In [None]:
df.estimated_size(unit="mb")

In [None]:
%%time
df2 = df.filter(pl.col("name").is_first_distinct())
len(df2)

In [None]:
%%time
df3 = df2.filter(pl.col("depth") > 20)
len(df3)

In [None]:
%%time
df4 = df3.filter(
    (pl.col("variants_path").list.get(0) == "<UNS9")
    & (pl.col("variants_path").list.get(-1) == "<UNS3")
)
len(df4)

In [None]:
df4["realign_cg"][:10].to_list()

In [None]:
df4["pLIB431-432:RBS|variant"].value_counts()

In [None]:
df4["pLIB431-432:RBS|seq"].value_counts().sort("counts", descending=True)

In [None]:
df4["pLIB431-432:RBS|cigar"].value_counts().sort("counts", descending=True)

In [None]:
df4["pLIB431-432:RBS|seq"].str.len_bytes().value_counts()

In [None]:
df4["pLIB431-432:RBS|seq"].value_counts().sort("counts", descending=True)

In [None]:
df4.columns

In [None]:
df4.filter(pl.col("BCD_RBS:RBS|seq").str.len_bytes() > 15)  # .get_column("depth")

In [None]:
df4.get_column("BCD_RBS:RBS|seq").value_counts().sort("counts", descending=True)

In [None]:
df4.get_column("pLIB433:PhlF_pPhlF|seq").value_counts().sort("counts", descending=True)

In [None]:
df4.get_column("pLIB434:LacI_pTac|seq").value_counts().sort("counts", descending=True)

In [None]:
df4.get_column("pLIB435:BetI_pBetI|seq").value_counts().sort("counts", descending=True)

In [None]:
df.filter(pl.col("depth") > 10)

## Pairwise align

In [None]:
refs = {
    "pLIB430": (
        "GCACTGAAGGTCCTCAATCGCACTGGAAACATCAAGGTCGACGAGGAGTTTACGGCTAGCTCAGTCCTAGGTATAGTGCTAGCTACTAGCGCTGTCTGTACTTGTATCAGTACACTGACGAGTCCCTAAAGGACGAAACACCGCCTCTACAAATAATTTTGTTTAACCATAGAGAAAGAGGAGAAATACTAATGGTCAGTAAAGGCGAAGCAGTTATCAAAGAGTTCATGCGCTTCAAAGTTCATATGGAAGGGTCGATGAACGGGCACGAATTTGAAATTGAAGGCGAAGGCGAAGGCCGCCCATATGAAGGGACCCAAACCGCAAAGCTTAAGGTTACTAAAGGCGGTCCATTACCCTTTTCGTGGGACATTTTAAGCCCACAGTTTATGTACGGGAGTCGCGCTTTCATCAAGCACCCTGCGGACATCCCAGATTACTACAAACAGTCTTTCCCCGAGGGGTTCAAGTGGGAGCGCGTGATGAACTTCGAGGATGGCGGAGCCGTGACGGTCACCCAAGATACCTCTTTGGAGGACGGTACGTTGATCTACAAAGTGAAATTGCGTGGCACGAATTTTCCACCTGATGGGCCTGTCATGCAGAAAAAGACAATGGGATGGGAAGCTTCCACGGAGCGCCTTTACCCAGAGGACGGTGTTCTTAAAGGGGATATCAAAATGGCGCTGCGTCTTAAAGATGGAGGCCGCTACCTGGCGGACTTCAAGACTACTTACAAGGCCAAAAAACCAGTGCAGATGCCGGGTGCGTACAATGTAGATCGTAAATTAGATATTACAAGTCACAATGAAGATTACACGGTCGTAGAGCAGTATGAGCGCAGTGAGGGGCGTCACTCTACGGGCGGTATGGACGAGTTATACAAGTAAAGGTGGAAACACAGAAAAAAGCCCGCACCTGACAGTGCGGGCTTTTTTTTTCGACCAAAGGCTCGGTACCAAATTCCAGAAAAGACACCCGAAAGGGTGTTTTTTCGTTTTGGTCCGCTTATGAAGGAGTTGACGGCTAGCTCAGTCCTAGGTACAGTGCTAGCTACTAGCTGTCACCGGATGTGCTTTCCGGTCTGATGAGTCCGTGAGGACGAAACAGCCTCTACAAATAATTTTGTTTAAGGGCCCAAGTTCACTTAAAAAGGAGATCAACAATGAAAGCAATTTTCGTACTGAAACATCTTAATCATGC",
        "NNNNNNNNN",
        "TTAAAAATGAGTAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGAATTAGATGGTGATGTTAATGGGCACAAATTTTCTGTCAGTGGAGAGGGTGAAGGTGATGCAACATACGGAAAACTTACCCTTAAATTGATTTGCACTACTGGAAAACTACCTGTTCCATGGCCAACACTTGTCACTACTTTGGGTTATGGTGTTCAATGCTTTGCGAGATACCCAGATCATATGAAACAGCATGACTTTTTCAAGAGTGCCATGCCCGAAGGTTATGTACAGGAAAGAACTATATTTTTCAAAGATGACGGGAACTACAAGACACGTGCTGAAGTCAAGTTTGAAGGTGATACCCTTGTTAATAGAATCGAGTTAAAAGGTATTGATTTTAAAGAAGATGGAAACATTCTTGGACACAAATTGGAATACAACTATAACTCACACAATGTATACATCACGGCAGACAAACAAAAGAATGGAATCAAAGCGAACTTCAAAATTAGACACAACATTGAAGATGGAGGTGTTCAACTAGCAGACCATTATCAACAAAATACTCCAATTGGCGATGGCCCTGTCCTTTTACCAGACAACCATTACCTGTCCTACCAATCTAAGCTTTCGAAAGATCCCAACGAAAAGAGAGATCACATGGTCCTTCTTGAGTTTGTAACAGCTGCTGGGATTACACATGGCATGGATGAACTATACAAATAAGGTCCGGCTTATCGGTCAGTTTCACCTGATTTACGTAAAAACCCGCTTCGGCGGGTTTTTGCTTTTGGAGGGGCAGAAAGATGAATGACTGTCCACGACGCTATACCCAAAAGAAAAAAAAAAAACCCCGCCCCTGACAGGGCGGGGTTTTTTTTGCTTA",
    ),
    "pLIB431": (
        "GCACTGAAGGTCCTCAATCGCACTGGAAACATCAAGGTCGACGAGGAGTTTACGGCTAGCTCAGTCCTAGGTATAGTGCTAGCTACTAGCGCTGTCTGTACTTGTATCAGTACACTGACGAGTCCCTAAAGGACGAAACACCGCCTCTACAAATAATTTTGTTTAACCATAGAGAAAGAGGAGAAATACTAATGGTCAGTAAAGGCGAAGCAGTTATCAAAGAGTTCATGCGCTTCAAAGTTCATATGGAAGGGTCGATGAACGGGCACGAATTTGAAATTGAAGGCGAAGGCGAAGGCCGCCCATATGAAGGGACCCAAACCGCAAAGCTTAAGGTTACTAAAGGCGGTCCATTACCCTTTTCGTGGGACATTTTAAGCCCACAGTTTATGTACGGGAGTCGCGCTTTCATCAAGCACCCTGCGGACATCCCAGATTACTACAAACAGTCTTTCCCCGAGGGGTTCAAGTGGGAGCGCGTGATGAACTTCGAGGATGGCGGAGCCGTGACGGTCACCCAAGATACCTCTTTGGAGGACGGTACGTTGATCTACAAAGTGAAATTGCGTGGCACGAATTTTCCACCTGATGGGCCTGTCATGCAGAAAAAGACAATGGGATGGGAAGCTTCCACGGAGCGCCTTTACCCAGAGGACGGTGTTCTTAAAGGGGATATCAAAATGGCGCTGCGTCTTAAAGATGGAGGCCGCTACCTGGCGGACTTCAAGACTACTTACAAGGCCAAAAAACCAGTGCAGATGCCGGGTGCGTACAATGTAGATCGTAAATTAGATATTACAAGTCACAATGAAGATTACACGGTCGTAGAGCAGTATGAGCGCAGTGAGGGGCGTCACTCTACGGGCGGTATGGACGAGTTATACAAGTAAAGGTGGAAACACAGAAAAAAGCCCGCACCTGACAGTGCGGGCTTTTTTTTTCGACCAAAGGCTCGGTACCAAATTCCAGAAAAGACACCCGAAAGGGTGTTTTTTCGTTTTGGTCCGCTTATGAAGGAGTTGACGGCTAGCTCAGTCCTAGGTACAGTGCTAGCTACTAGCGCTGTCTGTACTTGTATCAGTACACTGACGAGTCCCTAAAGGACGAAACACCGCCTCTACAAATAATTTTGTTTAACCAT",
        "AACAWAGAAAGGGGGTHHHCK",
        "AATGAGTAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGAATTAGATGGTGATGTTAATGGGCACAAATTTTCTGTCAGTGGAGAGGGTGAAGGTGATGCAACATACGGAAAACTTACCCTTAAATTGATTTGCACTACTGGAAAACTACCTGTTCCATGGCCAACACTTGTCACTACTTTGGGTTATGGTGTTCAATGCTTTGCGAGATACCCAGATCATATGAAACAGCATGACTTTTTCAAGAGTGCCATGCCCGAAGGTTATGTACAGGAAAGAACTATATTTTTCAAAGATGACGGGAACTACAAGACACGTGCTGAAGTCAAGTTTGAAGGTGATACCCTTGTTAATAGAATCGAGTTAAAAGGTATTGATTTTAAAGAAGATGGAAACATTCTTGGACACAAATTGGAATACAACTATAACTCACACAATGTATACATCACGGCAGACAAACAAAAGAATGGAATCAAAGCGAACTTCAAAATTAGACACAACATTGAAGATGGAGGTGTTCAACTAGCAGACCATTATCAACAAAATACTCCAATTGGCGATGGCCCTGTCCTTTTACCAGACAACCATTACCTGTCCTACCAATCTAAGCTTTCGAAAGATCCCAACGAAAAGAGAGATCACATGGTCCTTCTTGAGTTTGTAACAGCTGCTGGGATTACACATGGCATGGATGAACTATACAAATAAGGTCCGGCTTATCGGTCAGTTTCACCTGATTTACGTAAAAACCCGCTTCGGCGGGTTTTTGCTTTTGGAGGGGCAGAAAGATGAATGACTGTCCACGACGCTATACCCAAAAGAAAAAAAAAAAACCCCGCCCCTGACAGGGCGGGGTTTTTTTTGCTTA",
    ),
    "pLIB432_B0032m": (
        "GCACTGAAGGTCCTCAATCGCACTGGAAACATCAAGGTCGACGAGGAGTTTACGGCTAGCTCAGTCCTAGGTATAGTGCTAGCTACTAGCGCTGTCTGTACTTGTATCAGTACACTGACGAGTCCCTAAAGGACGAAACACCGCCTCTACAAATAATTTTGTTTAACCATAGAGAAAGAGGAGAAATACTAATGGTCAGTAAAGGCGAAGCAGTTATCAAAGAGTTCATGCGCTTCAAAGTTCATATGGAAGGGTCGATGAACGGGCACGAATTTGAAATTGAAGGCGAAGGCGAAGGCCGCCCATATGAAGGGACCCAAACCGCAAAGCTTAAGGTTACTAAAGGCGGTCCATTACCCTTTTCGTGGGACATTTTAAGCCCACAGTTTATGTACGGGAGTCGCGCTTTCATCAAGCACCCTGCGGACATCCCAGATTACTACAAACAGTCTTTCCCCGAGGGGTTCAAGTGGGAGCGCGTGATGAACTTCGAGGATGGCGGAGCCGTGACGGTCACCCAAGATACCTCTTTGGAGGACGGTACGTTGATCTACAAAGTGAAATTGCGTGGCACGAATTTTCCACCTGATGGGCCTGTCATGCAGAAAAAGACAATGGGATGGGAAGCTTCCACGGAGCGCCTTTACCCAGAGGACGGTGTTCTTAAAGGGGATATCAAAATGGCGCTGCGTCTTAAAGATGGAGGCCGCTACCTGGCGGACTTCAAGACTACTTACAAGGCCAAAAAACCAGTGCAGATGCCGGGTGCGTACAATGTAGATCGTAAATTAGATATTACAAGTCACAATGAAGATTACACGGTCGTAGAGCAGTATGAGCGCAGTGAGGGGCGTCACTCTACGGGCGGTATGGACGAGTTATACAAGTAAAGGTGGAAACACAGAAAAAAGCCCGCACCTGACAGTGCGGGCTTTTTTTTTCGACCAAAGGCTCGGTACCAAATTCCAGAAAAGACACCCGAAAGGGTGTTTTTTCGTTTTGGTCCGCTTATGAAGGAGTTGACGGCTAGCTCAGTCCTAGGTACAGTGCTAGCTACTAGCGCTGTCTGTACTTGTATCAGTACACTGACGAGTCCCTAAAGGACGAAACACCGCCTCTACAAATAATTTTGTTTAA",
        "CCATAGAGTCACACAGGAAAGTACT",
        "AATGAGTAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGAATTAGATGGTGATGTTAATGGGCACAAATTTTCTGTCAGTGGAGAGGGTGAAGGTGATGCAACATACGGAAAACTTACCCTTAAATTGATTTGCACTACTGGAAAACTACCTGTTCCATGGCCAACACTTGTCACTACTTTGGGTTATGGTGTTCAATGCTTTGCGAGATACCCAGATCATATGAAACAGCATGACTTTTTCAAGAGTGCCATGCCCGAAGGTTATGTACAGGAAAGAACTATATTTTTCAAAGATGACGGGAACTACAAGACACGTGCTGAAGTCAAGTTTGAAGGTGATACCCTTGTTAATAGAATCGAGTTAAAAGGTATTGATTTTAAAGAAGATGGAAACATTCTTGGACACAAATTGGAATACAACTATAACTCACACAATGTATACATCACGGCAGACAAACAAAAGAATGGAATCAAAGCGAACTTCAAAATTAGACACAACATTGAAGATGGAGGTGTTCAACTAGCAGACCATTATCAACAAAATACTCCAATTGGCGATGGCCCTGTCCTTTTACCAGACAACCATTACCTGTCCTACCAATCTAAGCTTTCGAAAGATCCCAACGAAAAGAGAGATCACATGGTCCTTCTTGAGTTTGTAACAGCTGCTGGGATTACACATGGCATGGATGAACTATACAAATAAGGTCCGGCTTATCGGTCAGTTTCACCTGATTTACGTAAAAACCCGCTTCGGCGGGTTTTTGCTTTTGGAGGGGCAGAAAGATGAATGACTGTCCACGACGCTATACCCAAAAGAAAAAAAAAAAACCCCGCCCCTGACAGGGCGGGGTTTTTTTTGCTTA",
    ),
    "pLIB432_B0033m": (
        "GCACTGAAGGTCCTCAATCGCACTGGAAACATCAAGGTCGACGAGGAGTTTACGGCTAGCTCAGTCCTAGGTATAGTGCTAGCTACTAGCGCTGTCTGTACTTGTATCAGTACACTGACGAGTCCCTAAAGGACGAAACACCGCCTCTACAAATAATTTTGTTTAACCATAGAGAAAGAGGAGAAATACTAATGGTCAGTAAAGGCGAAGCAGTTATCAAAGAGTTCATGCGCTTCAAAGTTCATATGGAAGGGTCGATGAACGGGCACGAATTTGAAATTGAAGGCGAAGGCGAAGGCCGCCCATATGAAGGGACCCAAACCGCAAAGCTTAAGGTTACTAAAGGCGGTCCATTACCCTTTTCGTGGGACATTTTAAGCCCACAGTTTATGTACGGGAGTCGCGCTTTCATCAAGCACCCTGCGGACATCCCAGATTACTACAAACAGTCTTTCCCCGAGGGGTTCAAGTGGGAGCGCGTGATGAACTTCGAGGATGGCGGAGCCGTGACGGTCACCCAAGATACCTCTTTGGAGGACGGTACGTTGATCTACAAAGTGAAATTGCGTGGCACGAATTTTCCACCTGATGGGCCTGTCATGCAGAAAAAGACAATGGGATGGGAAGCTTCCACGGAGCGCCTTTACCCAGAGGACGGTGTTCTTAAAGGGGATATCAAAATGGCGCTGCGTCTTAAAGATGGAGGCCGCTACCTGGCGGACTTCAAGACTACTTACAAGGCCAAAAAACCAGTGCAGATGCCGGGTGCGTACAATGTAGATCGTAAATTAGATATTACAAGTCACAATGAAGATTACACGGTCGTAGAGCAGTATGAGCGCAGTGAGGGGCGTCACTCTACGGGCGGTATGGACGAGTTATACAAGTAAAGGTGGAAACACAGAAAAAAGCCCGCACCTGACAGTGCGGGCTTTTTTTTTCGACCAAAGGCTCGGTACCAAATTCCAGAAAAGACACCCGAAAGGGTGTTTTTTCGTTTTGGTCCGCTTATGAAGGAGTTGACGGCTAGCTCAGTCCTAGGTACAGTGCTAGCTACTAGCGCTGTCTGTACTTGTATCAGTACACTGACGAGTCCCTAAAGGACGAAACACCGCCTCTACAAATAATTTTGTTTAA",
        "CCATAGAGTCACACAGGACTACT",
        "AATGAGTAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGAATTAGATGGTGATGTTAATGGGCACAAATTTTCTGTCAGTGGAGAGGGTGAAGGTGATGCAACATACGGAAAACTTACCCTTAAATTGATTTGCACTACTGGAAAACTACCTGTTCCATGGCCAACACTTGTCACTACTTTGGGTTATGGTGTTCAATGCTTTGCGAGATACCCAGATCATATGAAACAGCATGACTTTTTCAAGAGTGCCATGCCCGAAGGTTATGTACAGGAAAGAACTATATTTTTCAAAGATGACGGGAACTACAAGACACGTGCTGAAGTCAAGTTTGAAGGTGATACCCTTGTTAATAGAATCGAGTTAAAAGGTATTGATTTTAAAGAAGATGGAAACATTCTTGGACACAAATTGGAATACAACTATAACTCACACAATGTATACATCACGGCAGACAAACAAAAGAATGGAATCAAAGCGAACTTCAAAATTAGACACAACATTGAAGATGGAGGTGTTCAACTAGCAGACCATTATCAACAAAATACTCCAATTGGCGATGGCCCTGTCCTTTTACCAGACAACCATTACCTGTCCTACCAATCTAAGCTTTCGAAAGATCCCAACGAAAAGAGAGATCACATGGTCCTTCTTGAGTTTGTAACAGCTGCTGGGATTACACATGGCATGGATGAACTATACAAATAAGGTCCGGCTTATCGGTCAGTTTCACCTGATTTACGTAAAAACCCGCTTCGGCGGGTTTTTGCTTTTGGAGGGGCAGAAAGATGAATGACTGTCCACGACGCTATACCCAAAAGAAAAAAAAAAAACCCCGCCCCTGACAGGGCGGGGTTTTTTTTGCTTA",
    ),
    "pLIB432_B0034m": (
        "GCACTGAAGGTCCTCAATCGCACTGGAAACATCAAGGTCGACGAGGAGTTTACGGCTAGCTCAGTCCTAGGTATAGTGCTAGCTACTAGCGCTGTCTGTACTTGTATCAGTACACTGACGAGTCCCTAAAGGACGAAACACCGCCTCTACAAATAATTTTGTTTAACCATAGAGAAAGAGGAGAAATACTAATGGTCAGTAAAGGCGAAGCAGTTATCAAAGAGTTCATGCGCTTCAAAGTTCATATGGAAGGGTCGATGAACGGGCACGAATTTGAAATTGAAGGCGAAGGCGAAGGCCGCCCATATGAAGGGACCCAAACCGCAAAGCTTAAGGTTACTAAAGGCGGTCCATTACCCTTTTCGTGGGACATTTTAAGCCCACAGTTTATGTACGGGAGTCGCGCTTTCATCAAGCACCCTGCGGACATCCCAGATTACTACAAACAGTCTTTCCCCGAGGGGTTCAAGTGGGAGCGCGTGATGAACTTCGAGGATGGCGGAGCCGTGACGGTCACCCAAGATACCTCTTTGGAGGACGGTACGTTGATCTACAAAGTGAAATTGCGTGGCACGAATTTTCCACCTGATGGGCCTGTCATGCAGAAAAAGACAATGGGATGGGAAGCTTCCACGGAGCGCCTTTACCCAGAGGACGGTGTTCTTAAAGGGGATATCAAAATGGCGCTGCGTCTTAAAGATGGAGGCCGCTACCTGGCGGACTTCAAGACTACTTACAAGGCCAAAAAACCAGTGCAGATGCCGGGTGCGTACAATGTAGATCGTAAATTAGATATTACAAGTCACAATGAAGATTACACGGTCGTAGAGCAGTATGAGCGCAGTGAGGGGCGTCACTCTACGGGCGGTATGGACGAGTTATACAAGTAAAGGTGGAAACACAGAAAAAAGCCCGCACCTGACAGTGCGGGCTTTTTTTTTCGACCAAAGGCTCGGTACCAAATTCCAGAAAAGACACCCGAAAGGGTGTTTTTTCGTTTTGGTCCGCTTATGAAGGAGTTGACGGCTAGCTCAGTCCTAGGTACAGTGCTAGCTACTAGCGCTGTCTGTACTTGTATCAGTACACTGACGAGTCCCTAAAGGACGAAACACCGCCTCTACAAATAATTTTGTTTAA",
        "CCATAGAGAAAGAGGAGAAATACT",
        "AATGAGTAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGAATTAGATGGTGATGTTAATGGGCACAAATTTTCTGTCAGTGGAGAGGGTGAAGGTGATGCAACATACGGAAAACTTACCCTTAAATTGATTTGCACTACTGGAAAACTACCTGTTCCATGGCCAACACTTGTCACTACTTTGGGTTATGGTGTTCAATGCTTTGCGAGATACCCAGATCATATGAAACAGCATGACTTTTTCAAGAGTGCCATGCCCGAAGGTTATGTACAGGAAAGAACTATATTTTTCAAAGATGACGGGAACTACAAGACACGTGCTGAAGTCAAGTTTGAAGGTGATACCCTTGTTAATAGAATCGAGTTAAAAGGTATTGATTTTAAAGAAGATGGAAACATTCTTGGACACAAATTGGAATACAACTATAACTCACACAATGTATACATCACGGCAGACAAACAAAAGAATGGAATCAAAGCGAACTTCAAAATTAGACACAACATTGAAGATGGAGGTGTTCAACTAGCAGACCATTATCAACAAAATACTCCAATTGGCGATGGCCCTGTCCTTTTACCAGACAACCATTACCTGTCCTACCAATCTAAGCTTTCGAAAGATCCCAACGAAAAGAGAGATCACATGGTCCTTCTTGAGTTTGTAACAGCTGCTGGGATTACACATGGCATGGATGAACTATACAAATAAGGTCCGGCTTATCGGTCAGTTTCACCTGATTTACGTAAAAACCCGCTTCGGCGGGTTTTTGCTTTTGGAGGGGCAGAAAGATGAATGACTGTCCACGACGCTATACCCAAAAGAAAAAAAAAAAACCCCGCCCCTGACAGGGCGGGGTTTTTTTTGCTTA",
    ),
    "pLIB433": (
        "GCACTGAAGGTCCTCAATCGCACTGGAAACATCAAGGTCGACGAGGAGTTTACGGCTAGCTCAGTCCTAGGTATAGTGCTAGCTACTAGCGCTGTCTGTACTTGTATCAGTACACTGACGAGTCCCTAAAGGACGAAACACCGCCTCTACAAATAATTTTGTTTAACCATAGAGAAAGAGGAGAAATACTAATGGTCAGTAAAGGCGAAGCAGTTATCAAAGAGTTCATGCGCTTCAAAGTTCATATGGAAGGGTCGATGAACGGGCACGAATTTGAAATTGAAGGCGAAGGCGAAGGCCGCCCATATGAAGGGACCCAAACCGCAAAGCTTAAGGTTACTAAAGGCGGTCCATTACCCTTTTCGTGGGACATTTTAAGCCCACAGTTTATGTACGGGAGTCGCGCTTTCATCAAGCACCCTGCGGACATCCCAGATTACTACAAACAGTCTTTCCCCGAGGGGTTCAAGTGGGAGCGCGTGATGAACTTCGAGGATGGCGGAGCCGTGACGGTCACCCAAGATACCTCTTTGGAGGACGGTACGTTGATCTACAAAGTGAAATTGCGTGGCACGAATTTTCCACCTGATGGGCCTGTCATGCAGAAAAAGACAATGGGATGGGAAGCTTCCACGGAGCGCCTTTACCCAGAGGACGGTGTTCTTAAAGGGGATATCAAAATGGCGCTGCGTCTTAAAGATGGAGGCCGCTACCTGGCGGACTTCAAGACTACTTACAAGGCCAAAAAACCAGTGCAGATGCCGGGTGCGTACAATGTAGATCGTAAATTAGATATTACAAGTCACAATGAAGATTACACGGTCGTAGAGCAGTATGAGCGCAGTGAGGGGCGTCACTCTACGGGCGGTATGGACGAGTTATACAAGTAAAGGTGGAAACACAGAAAAAAGCCCGCACCTGACAGTGCGGGCTTTTTTTTTCGACCAAAGGCTCGGTACCAAATTCCAGAAAAGACACCCGAAAGGGTGTTTTTTCGTTTTGGTCCGCTTATGAAGGAGTTGACGGCTAGCTCAGTCCTAGGTACAGTGCTAGCTACTAGCTGTCACCGGATGTGCTTTCCGGTCTGATGAGTCCGTGAGGACGAAACAGCCTCTACAAATAATTTTGTTTAAGGGCCCAAGTTCACTTAAAAAGGAGATCAACAATGAAAGCAATTTTCGTACTGAAACATCTTAATCATGC",
        "NNNNNNNNN",
        "TTAAAAATGGCACGTACCCCGAGCCGTAGCAGCATTGGTAGCCTGCGTAGTCCGCATACCCATAAAGCAATTCTGACCAGCACCATTGAAATCCTGAAAGAATGTGGTTATAGCGGTCTGAGCATTGAAAGCGTGGCACGTCGCGCCGGTGCAGGCAAACCGACCATTTATCGTTGGTGGACCAACAAAGCAGCACTGATTGCCGAAGTGTATGAAAATGAAATCGAACAGGTACGTAAATTTCCGGATTTGGGTAGCTTTAAAGCCGATCTGGATTTTCTGCTGCATAATCTGTGGAAAGTTTGGCGTGAAACCATTTGTGGTGAAGCATTTCGTTGTGTTATTGCAGAAGCACAGTTGGACCCTGTAACCCTGACCCAACTGAAAGATCAGTTTATGGAACGTCGTCGTGAGATACCGAAAAAACTGGTTGAAGATGCCATTAGCAATGGTGAACTGCCGAAAGATATCAATCGTGAACTGCTGCTGGATATGATTTTTGGTTTTTGTTGGTATCGCCTGCTGACCGAACAGTTGACCGTTGAACAGGATATTGAAGAATTTACCTTCCTGCTGATTAATGGTGTTTGTCCGGGTACACAGTGTTAATAAGGTCCGGCAATTAAAAAAGCGGCTAACCACGCCGCTTTTTTTACGTCTGCACTCGGTACCAAATTCCAGAAAAGAGGCCTCCCGAAAGGGGGGCCTTTTTTCGTTTTGGTCCGCTTAAATAGGAGCGACGTACGGTGGAATCTGATTCGTTACCAATTGACATGATACGAAACGTACCGTATCGTTAAGGTTACTAGCGCTGTCTGTACTTGTATCAGTACACTGACGAGTCCCTAAAGGACGAAACACCGCCTCTACAAATAATTTTGTTTAACCATAGAGTCACACAGGAAAGTACTAATGAGTAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGAATTAGATGGTGATGTTAATGGGCACAAATTTTCTGTCAGTGGAGAGGGTGAAGGTGATGCAACATACGGAAAACTTACCCTTAAATTGATTTGCACTACTGGAAAACTACCTGTTCCATGGCCAACACTTGTCACTACTTTGGGTTATGGTGTTCAATGCTTTGCGAGATACCCAGATCATATGAAACAGCATGACTTTTTCAAGAGTGCCATGCCCGAAGGTTATGTACAGGAAAGAACTATATTTTTCAAAGATGACGGGAACTACAAGACACGTGCTGAAGTCAAGTTTGAAGGTGATACCCTTGTTAATAGAATCGAGTTAAAAGGTATTGATTTTAAAGAAGATGGAAACATTCTTGGACACAAATTGGAATACAACTATAACTCACACAATGTATACATCACGGCAGACAAACAAAAGAATGGAATCAAAGCGAACTTCAAAATTAGACACAACATTGAAGATGGAGGTGTTCAACTAGCAGACCATTATCAACAAAATACTCCAATTGGCGATGGCCCTGTCCTTTTACCAGACAACCATTACCTGTCCTACCAATCTAAGCTTTCGAAAGATCCCAACGAAAAGAGAGATCACATGGTCCTTCTTGAGTTTGTAACAGCTGCTGGGATTACACATGGCATGGATGAACTATACAAATAAGGTCCGGCTTATCGGTCAGTTTCACCTGATTTACGTAAAAACCCGCTTCGGCGGGTTTTTGCTTTTGGAGGGGCAGAAAGATGAATGACTGTCCACGACGCTATACCCAAAAGAAAAAAAAAAAACCCCGCCCCTGACAGGGCGGGGTTTTTTTTGCTTA",
    ),
    "pLIB434": (
        "GCACTGAAGGTCCTCAATCGCACTGGAAACATCAAGGTCGACGAGGAGTTTACGGCTAGCTCAGTCCTAGGTATAGTGCTAGCTACTAGCGCTGTCTGTACTTGTATCAGTACACTGACGAGTCCCTAAAGGACGAAACACCGCCTCTACAAATAATTTTGTTTAACCATAGAGAAAGAGGAGAAATACTAATGGTCAGTAAAGGCGAAGCAGTTATCAAAGAGTTCATGCGCTTCAAAGTTCATATGGAAGGGTCGATGAACGGGCACGAATTTGAAATTGAAGGCGAAGGCGAAGGCCGCCCATATGAAGGGACCCAAACCGCAAAGCTTAAGGTTACTAAAGGCGGTCCATTACCCTTTTCGTGGGACATTTTAAGCCCACAGTTTATGTACGGGAGTCGCGCTTTCATCAAGCACCCTGCGGACATCCCAGATTACTACAAACAGTCTTTCCCCGAGGGGTTCAAGTGGGAGCGCGTGATGAACTTCGAGGATGGCGGAGCCGTGACGGTCACCCAAGATACCTCTTTGGAGGACGGTACGTTGATCTACAAAGTGAAATTGCGTGGCACGAATTTTCCACCTGATGGGCCTGTCATGCAGAAAAAGACAATGGGATGGGAAGCTTCCACGGAGCGCCTTTACCCAGAGGACGGTGTTCTTAAAGGGGATATCAAAATGGCGCTGCGTCTTAAAGATGGAGGCCGCTACCTGGCGGACTTCAAGACTACTTACAAGGCCAAAAAACCAGTGCAGATGCCGGGTGCGTACAATGTAGATCGTAAATTAGATATTACAAGTCACAATGAAGATTACACGGTCGTAGAGCAGTATGAGCGCAGTGAGGGGCGTCACTCTACGGGCGGTATGGACGAGTTATACAAGTAAAGGTGGAAACACAGAAAAAAGCCCGCACCTGACAGTGCGGGCTTTTTTTTTCGACCAAAGGCTCGGTACCAAATTCCAGAAAAGACACCCGAAAGGGTGTTTTTTCGTTTTGGTCCGCTTATGAAGGAGTTGACGGCTAGCTCAGTCCTAGGTACAGTGCTAGCTACTAGCTGTCACCGGATGTGCTTTCCGGTCTGATGAGTCCGTGAGGACGAAACAGCCTCTACAAATAATTTTGTTTAAGGGCCCAAGTTCACTTAAAAAGGAGATCAACAATGAAAGCAATTTTCGTACTGAAACATCTTAATCATGC",
        "NNNNNNNNN",
        "TTAAAAATGAAACCAGTAACGTTATACGATGTCGCAGAGTATGCCGGTGTCTCTTATATGACCGTTTCCCGCGTGGTGAACCAGGCCAGCCACGTTTCTGCGAAAACGCGGGAAAAAGTGGAAGCGGCGATGGTGGAGCTGAATTACATTCCCAACCGCGTGGCACAACAACTGGCGGGCAAACAGTCGTTGCTGATTGGCGTTGCCACCTCCAGTCTGGCCCTGCACGCGCCGTCGCAAATTGTCGCGGCGATTAAATCTCGCGCCGATCAACTGGGTGCCAGCGTGGTGGTGTCGATGGTAGAACGAAGCGGCGTCGAAGCCTGTAAAGCGGCGGTGCACAATCTTCTCGCGCAACGCGTCAGTGGGCTGATCATTAACTATCCGCTGGATGACCAGGATGCCATTGCTGTGGAAGCTGCCTGCACTAATGTTCCGGCGTTATTTCTTGATGTCTCTGACCAGACACCCATCAACAGTATTATTTACTCCCATGAGGACGGTACGCGACTGGGCGTGGAGCATCTGGTCGCATTGGGTCACCAGCAAATCGCGCTGTTAGCGGGCCCATTAAGTTCTGTCTCGGCGCGTCTGCGTCTGGCTGGCTGGCATAAATATCTCACTCGCAATCAAATTCAGCCGATAGCGGAACGGGAAGGCGACTGGAGTGCCATGTCCGGTTTTCAACAAACCATGCAAATGCTGAATGAGGGCATCGTTCCCACTGCGATGCTGGTTGCCAACGATCAGATGGCGCTGGGCGCAATGCGCGCCATTACCGAGTCCGGGCTGCGCGTTGGTGCGGATATCTCGGTAGTGGGATACGACGATACCGAAGATAGCTCATGTTATATCCCGCCGTTAACCACCATCAAACAGGATTTTCGCCTGCTGGGGCAAACCAGCGTGGACCGCTTGCTGCAACTCTCTCAGGGCCAGGCGGTGAAGGGCAATCAGCTGTTGCCAGTCTCACTGGTGAAAAGAAAAACCACCCTGGCGCCCAATACGCAAACCGCCTCTCCCCGCGCGTTGGCCGATTCATTAATGCAGCTGGCACGACAGGTTTCCCGACTGGAAAGCGGGCAGTAATAAGGTCCGGCAATTAAAAAAGCGGCTAACCACGCCGCTTTTTTTACGTCTGCACTCGGTACCAAATTCCAGAAAAGAGGCCTCCCGAAAGGGGGGCCTTTTTTCGTTTTGGTCCGCTTAAATAGGAGTGTTGACAATTAATCATCGGCTCGTATAATGTGTGGAATTGTGAGCGCTCACAATTTACTAGCGCTGTCTGTACTTGTATCAGTACACTGACGAGTCCCTAAAGGACGAAACACCGCCTCTACAAATAATTTTGTTTAACCATAGAGTCACACAGGAAAGTACTAATGAGTAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGAATTAGATGGTGATGTTAATGGGCACAAATTTTCTGTCAGTGGAGAGGGTGAAGGTGATGCAACATACGGAAAACTTACCCTTAAATTGATTTGCACTACTGGAAAACTACCTGTTCCATGGCCAACACTTGTCACTACTTTGGGTTATGGTGTTCAATGCTTTGCGAGATACCCAGATCATATGAAACAGCATGACTTTTTCAAGAGTGCCATGCCCGAAGGTTATGTACAGGAAAGAACTATATTTTTCAAAGATGACGGGAACTACAAGACACGTGCTGAAGTCAAGTTTGAAGGTGATACCCTTGTTAATAGAATCGAGTTAAAAGGTATTGATTTTAAAGAAGATGGAAACATTCTTGGACACAAATTGGAATACAACTATAACTCACACAATGTATACATCACGGCAGACAAACAAAAGAATGGAATCAAAGCGAACTTCAAAATTAGACACAACATTGAAGATGGAGGTGTTCAACTAGCAGACCATTATCAACAAAATACTCCAATTGGCGATGGCCCTGTCCTTTTACCAGACAACCATTACCTGTCCTAC
CAATCTAAGCTTTCGAAAGATCCCAACGAAAAGAGAGATCACATGGTCCTTCTTGAGTTTGTAACAGCTGCTGGGATTACACATGGCATGGATGAACTATACAAATAAGGTCCGGCTTATCGGTCAGTTTCACCTGATTTACGTAAAAACCCGCTTCGGCGGGTTTTTGCTTTTGGAGGGGCAGAAAGATGAATGACTGTCCACGACGCTATACCCAAAAGAAAAAAAAAAAACCCCGCCCCTGACAGGGCGGGGTTTTTTTTGCTTA",
    ),
    "pLIB435": (
        "GCACTGAAGGTCCTCAATCGCACTGGAAACATCAAGGTCGACGAGGAGTTTACGGCTAGCTCAGTCCTAGGTATAGTGCTAGCTACTAGCGCTGTCTGTACTTGTATCAGTACACTGACGAGTCCCTAAAGGACGAAACACCGCCTCTACAAATAATTTTGTTTAACCATAGAGAAAGAGGAGAAATACTAATGGTCAGTAAAGGCGAAGCAGTTATCAAAGAGTTCATGCGCTTCAAAGTTCATATGGAAGGGTCGATGAACGGGCACGAATTTGAAATTGAAGGCGAAGGCGAAGGCCGCCCATATGAAGGGACCCAAACCGCAAAGCTTAAGGTTACTAAAGGCGGTCCATTACCCTTTTCGTGGGACATTTTAAGCCCACAGTTTATGTACGGGAGTCGCGCTTTCATCAAGCACCCTGCGGACATCCCAGATTACTACAAACAGTCTTTCCCCGAGGGGTTCAAGTGGGAGCGCGTGATGAACTTCGAGGATGGCGGAGCCGTGACGGTCACCCAAGATACCTCTTTGGAGGACGGTACGTTGATCTACAAAGTGAAATTGCGTGGCACGAATTTTCCACCTGATGGGCCTGTCATGCAGAAAAAGACAATGGGATGGGAAGCTTCCACGGAGCGCCTTTACCCAGAGGACGGTGTTCTTAAAGGGGATATCAAAATGGCGCTGCGTCTTAAAGATGGAGGCCGCTACCTGGCGGACTTCAAGACTACTTACAAGGCCAAAAAACCAGTGCAGATGCCGGGTGCGTACAATGTAGATCGTAAATTAGATATTACAAGTCACAATGAAGATTACACGGTCGTAGAGCAGTATGAGCGCAGTGAGGGGCGTCACTCTACGGGCGGTATGGACGAGTTATACAAGTAAAGGTGGAAACACAGAAAAAAGCCCGCACCTGACAGTGCGGGCTTTTTTTTTCGACCAAAGGCTCGGTACCAAATTCCAGAAAAGACACCCGAAAGGGTGTTTTTTCGTTTTGGTCCGCTTATGAAGGAGTTGACGGCTAGCTCAGTCCTAGGTACAGTGCTAGCTACTAGCTGTCACCGGATGTGCTTTCCGGTCTGATGAGTCCGTGAGGACGAAACAGCCTCTACAAATAATTTTGTTTAAGGGCCCAAGTTCACTTAAAAAGGAGATCAACAATGAAAGCAATTTTCGTACTGAAACATCTTAATCATGC",
        "NNNNNNNNN",
        "TTAAAAATGCCGAAACTGGGTATGCAGAGCATTCGTCGTCGTCAGCTGATTGATGCAACCCTGGAAGCAATTAATGAAGTTGGTATGCATGATGCAACCATTGCACAGATTGCACGTCGTGCCGGTGTTAGCACCGGTATTATTAGCCATTATTTCCGCGATAAAAACGGTCTACTGGAAGCAACCATGCGTGATATTACCAGCCAGCTGCGTGATGCAGTTCTGAATCGTCTGCATGCACTGCCGCAGGGTAGCGCAGAACAGCGTCTGCAGGCAATTGTTGGTGGTAATTTTGATGAAACCCAGGTTAGCAGCGCAGCAATGAAAGCATGGCTGGCATTTTGGGCAATCAGCATGCATCAGCCGATGCTGTATCGTCTGCAGCAGGTTAGCAGTCGTCGTCTGCTGAGCAATCTGGTTAGCGAATTTCGTCGTGAACTGCCTCGTGAACAGGCACAAGAGGCAGGTTATGGTCTGGCAGCACTGATTGATGGTCTGTGGCTGCGTGCAGCACTGAGCGGTAAACCGCTGGATAAAACCCGTGCAAATAGCCTGACCCGTCATTTTATCACCCAGCATCTGCCGACCGATTAATAAGGTCCGGCAATTAAAAAAGCGGCTAACCACGCCGCTTTTTTTACGTCTGCACTCGGTACCAAATTCCAGAAAAGAGGCCTCCCGAAAGGGGGGCCTTTTTTCGTTTTGGTCCGCTTAAATAGGAGAGCGCGGGTGAGAGGGATTCGTTACCAATAGACAATTGATTGGACGTTCAATATAATGCTAGCTACTAGCGCTGTCTGTACTTGTATCAGTACACTGACGAGTCCCTAAAGGACGAAACACCGCCTCTACAAATAATTTTGTTTAACCATAGAGTCACACAGGAAAGTACTAATGAGTAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGAATTAGATGGTGATGTTAATGGGCACAAATTTTCTGTCAGTGGAGAGGGTGAAGGTGATGCAACATACGGAAAACTTACCCTTAAATTGATTTGCACTACTGGAAAACTACCTGTTCCATGGCCAACACTTGTCACTACTTTGGGTTATGGTGTTCAATGCTTTGCGAGATACCCAGATCATATGAAACAGCATGACTTTTTCAAGAGTGCCATGCCCGAAGGTTATGTACAGGAAAGAACTATATTTTTCAAAGATGACGGGAACTACAAGACACGTGCTGAAGTCAAGTTTGAAGGTGATACCCTTGTTAATAGAATCGAGTTAAAAGGTATTGATTTTAAAGAAGATGGAAACATTCTTGGACACAAATTGGAATACAACTATAACTCACACAATGTATACATCACGGCAGACAAACAAAAGAATGGAATCAAAGCGAACTTCAAAATTAGACACAACATTGAAGATGGAGGTGTTCAACTAGCAGACCATTATCAACAAAATACTCCAATTGGCGATGGCCCTGTCCTTTTACCAGACAACCATTACCTGTCCTACCAATCTAAGCTTTCGAAAGATCCCAACGAAAAGAGAGATCACATGGTCCTTCTTGAGTTTGTAACAGCTGCTGGGATTACACATGGCATGGATGAACTATACAAATAAGGTCCGGCTTATCGGTCAGTTTCACCTGATTTACGTAAAAACCCGCTTCGGCGGGTTTTTGCTTTTGGAGGGGCAGAAAGATGAATGACTGTCCACGACGCTATACCCAAAAGAAAAAAAAAAAACCCCGCCCCTGACAGGGCGGGGTTTTTTTTGCTTA",
    ),
}

In [None]:
df4.select(
    pl.col("depth", "simplex_depth", "duplex_depth"),
    pl.col("consensus_seq").str.len_bytes(),
).head(10)

In [None]:
print(seq)

In [None]:
parasail_kwargs = {
    "parasail_algorithm": "sw",
    "degenerate": True,
    "match": 1,
    "mismatch": -1,
    "gap_opening": 3,
    "gap_extension": 1,
}

In [None]:
idx = 308
df_idx = df2_10
seq = df_idx[idx, "consensus_seq"]
path = df_idx[idx, "variants_path"]

In [None]:
%%time
alignments = {}
for name, segments in tqdm(refs.items()):
    ref_seq = reverse_complement("".join(segments))
    score, cigar = align.pairwise_align(seq, ref_seq, **parasail_kwargs)
    score -= len(ref_seq)
    alignments[name] = (score, cigar)

In [None]:
alignments

In [None]:
df_idx[idx]

In [None]:
# 2001
alignments

In [None]:
df2[2004]

In [None]:
df2_long = df2.filter(pl.col("query_length") > 3200)

In [None]:
df2_5 = df2.filter(pl.col("depth") > 5)
df2_10 = df2.filter(pl.col("depth") > 10)
(len(df2_5), len(df2_10))

In [None]:
df2_long_5 = df2_long.filter(pl.col("depth") > 5)
df2_long_10 = df2_long.filter(pl.col("depth") > 10)

In [None]:
plt.hist(df2["query_length"], bins=100);

In [None]:
plt.hist(df2_long_10["query_length"], bins=100);

In [None]:
len(df2)

In [None]:
len(df2_long)

In [None]:
len(df2_long_5)

In [None]:
len(df2_long_10)

In [None]:
{name: len("".join(segments)) for name, segments in refs.items()}

In [None]:
%%time
df_idx = df2_5
freqs = Counter()
idxs = defaultdict(list)
cigars = defaultdict(list)
num = len(df_idx)
# num = 5000
for idx in trange(num):
    seq = df_idx[idx, "consensus_seq"]
    path = df_idx[idx, "variants_path"]
    alignments = {}
    for name, segments in refs.items():
        ref_seq = reverse_complement("".join(segments))
        score, cigar = align.pairwise_align(seq, ref_seq, **parasail_kwargs)
        score -= len(ref_seq)
        alignments[name] = (score, cigar)
    sorted_alignments = sorted(alignments.items(), key=lambda x: x[1][0], reverse=True)
    best_name = sorted_alignments[0][0]
    best_cigar = sorted_alignments[0][1]
    freqs[best_name] += 1
    idxs[best_name].append(idx)
    cigars[best_name].append(best_cigar)

In [None]:
freqs

In [None]:
freqs

In [None]:
cigars