In [None]:
import itertools as it
import operator
import re
from collections import Counter, defaultdict

import gfapy
import holoviews as hv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow as pa
from pyarrow import csv
from tqdm.auto import tqdm, trange

In [None]:
hv.extension("bokeh")

In [None]:
!micromamba list|grep protobuf

In [None]:
!micromamba install -y protobuf=4.21.7 async_generator pyarrow

In [None]:
!pip install --no-deps pystream-protobuf

# GAF

In [None]:
import re
from collections import Counter

In [None]:
def segment_frequences(table, segment_names):
    """Count how often each graph segment occurs in every read's path.

    Parameters
    ----------
    table
        Object exposing ``column("path")`` (iterable of path values whose
        ``str()`` is a ``>``/``<``-delimited list of segment names) and
        ``column("name")`` (read names, used as the row index).
    segment_names
        Segment names to tabulate; they become the columns, in this order.

    Returns
    -------
    pandas.DataFrame
        One row per read and one column per segment, holding occurrence counts.
    """
    # One Counter per read; orientation markers are only used as separators.
    per_read_tallies = (
        Counter(re.split(r">|<", str(path))) for path in table.column("path")
    )
    data = [[tally[name] for name in segment_names] for tally in per_read_tallies]
    return pd.DataFrame(data, index=table.column("name"), columns=segment_names)

In [None]:
gfa = gfapy.Gfa.from_file("nao745bc.gfa")

In [None]:
!du -hs *.gaf

In [None]:
# gaf_filename = "duplex_hac1_subsample_dbg.gaf"
# gaf_filename = "duplex_hac1_subsample_dbg2.gaf"
# gaf_filename = "duplex_hac1_subsample_vg2.gaf"
gaf_filename = "duplex_sup1_vg2.gaf"

In [None]:
%%time
# Stream the GAF file in record batches and tally, per read, which graph
# segments it traverses. Accumulates:
#   total          - number of rows seen
#   duplex         - rows whose name contains ";"
#   filtered       - rows passing the filter below
#   segment_counts - Counter of full per-segment count tuples (filtered rows)
#   barcode_counts - Counter of the BIT* columns only (filtered rows)
# SEE: http://samtools.github.io/hts-specs/SAMv1.pdf
# and https://samtools.github.io/hts-specs/SAMtags.pdf
SAM_TAG_TYPES = {
    "A": pa.dictionary(pa.int32(), pa.string()),
    "f": pa.float32(),
    "i": pa.int32(),
    "Z": pa.string(),
}
# The set of optional tag columns is inferred from the FIRST line only.
# NOTE(review): rows with a different number/order of tags are presumably
# mis-parsed or rejected by the CSV reader - confirm against the aligner output.
with open(gaf_filename, "r") as f:
    first = f.readline().split("\t")
tags = first[12:]
# Maps tag name -> Arrow type for its declared SAM type letter. Only the keys
# are used below; a tag with an unknown type letter raises KeyError here.
tag_column_types = {(t := tag.split(":"))[0]: SAM_TAG_TYPES[t[1]] for tag in tags}
# Twelve fixed columns followed by one column per tag found on the first line.
column_types = {
    "name": pa.string(),
    "query_length": pa.uint64(),
    "query_start": pa.uint64(),
    "query_end": pa.uint64(),
    "strand": pa.dictionary(pa.int32(), pa.string()),
    "path": pa.string(),
    "path_length": pa.uint64(),
    "path_start": pa.uint64(),
    "path_end": pa.uint64(),
    "residue_matches": pa.uint64(),
    "block_length": pa.uint64(),
    "mapping_quality": pa.uint8(),
    # Tag cells are kept as raw strings (e.g. the whole "TAG:TYPE:VALUE" field),
    # so every tag column is typed as string regardless of SAM_TAG_TYPES.
    **{tag: pa.string() for tag in tag_column_types.keys()},
}
read_options = csv.ReadOptions(column_names=column_types.keys())
parse_options = csv.ParseOptions(delimiter="\t")
convert_options = csv.ConvertOptions(column_types=column_types)
with csv.open_csv(
    gaf_filename,
    read_options=read_options,
    parse_options=parse_options,
    convert_options=convert_options,
) as f:
    # tt = f.read_next_batch()
    segment_counts = Counter()
    barcode_counts = Counter()
    total = 0
    duplex = 0
    singleton = 0  # never updated in this cell
    filtered = 0
    # while True:
    # for _ in tqdm(it.islice(it.count(), 10)):
    # it.count() only drives the progress bar; the loop ends when the reader
    # signals exhaustion by raising StopIteration.
    for _ in tqdm(it.count()):
        try:
            table = f.read_next_batch()
        except StopIteration:
            break
        # break
        freqs = segment_frequences(table, gfa.segment_names)
        duplex += freqs.index.str.contains(";").sum()
        # filtered_freqs = freqs[(freqs.max(axis=1) == 1) & ((freqs["BIT0OFF"] == 1) | (freqs["BIT0ON"] == 1)) & (freqs["pPhlF"] == 1)]
        # Keep rows that: have ";" in the name, visit no segment more than
        # once, contain exactly one of BIT0OFF/BIT0ON, and contain RBS1 once.
        filtered_freqs = freqs[
            freqs.index.str.contains(";")
            & (freqs.max(axis=1) == 1)
            & ((freqs["BIT0OFF"] == 1) | (freqs["BIT0ON"] == 1))
            & (freqs["RBS1"] == 1)
        ]
        # filtered_freqs = freqs
        filtered += len(filtered_freqs)
        total += len(freqs)
        segment_counts.update(list(filtered_freqs.itertuples(index=False)))
        barcode_counts.update(
            list(
                filtered_freqs.loc[
                    :, filtered_freqs.columns.str.startswith("BIT")
                ].itertuples(index=False)
            )
        )
# NOTE: after the loop, `freqs` and `filtered_freqs` hold only the LAST batch;
# cells that use them afterwards describe that batch, not the whole file.

In [None]:
(filtered, total, filtered / total, duplex, duplex / total)

In [None]:
def bit_sums(freqs, num_bits=30):
    """Sum the ON and OFF counts of every barcode bit.

    Parameters
    ----------
    freqs : pandas.DataFrame
        Must contain the columns ``BIT{i}ON`` and ``BIT{i}OFF`` for every
        ``i`` in ``range(num_bits)``.
    num_bits : int, default 30
        Number of barcode bits to summarise. The default keeps the previously
        hard-coded value.

    Returns
    -------
    pandas.DataFrame
        Columns ``BIT0`` .. ``BIT{num_bits - 1}``, each the element-wise sum of
        the corresponding ON and OFF columns.

    Raises
    ------
    KeyError
        If any required ON/OFF column is missing.
    """
    return pd.DataFrame(
        {
            f"BIT{i}": freqs[f"BIT{i}ON"] + freqs[f"BIT{i}OFF"]
            for i in range(num_bits)
        }
    )

In [None]:
filtered_freqs.mean(axis=0)

In [None]:
filtered_freqs.loc[:, filtered_freqs.columns.str.endswith("ON")].mean(axis=0).plot.bar()

In [None]:
filtered_freqs.mean(axis=0).plot.bar()

In [None]:
segment_counts.most_common(3)

In [None]:
barcode_counts.most_common(3)

In [None]:
plt.hist(barcode_counts.values(), bins=100, log=True);

In [None]:
n, bins, patches = plt.hist(
    barcode_counts.values(),
    100,
    histtype="step",
    density=False,
    cumulative=-1,
    log=True,
)

In [None]:
n, bins, patches = plt.hist(
    barcode_counts.values(),
    100,
    range=(0, 10),
    histtype="step",
    density=False,
    cumulative=-1,
    log=True,
)

In [None]:
len(barcode_counts)

In [None]:
sum(1 for v in barcode_counts.values() if v == 1)

In [None]:
sum(1 for v in barcode_counts.values() if v == 2)

In [None]:
sum(1 for v in barcode_counts.values() if v == 3)

In [None]:
sum(1 for v in barcode_counts.values() if v == 4)

In [None]:
sum(1 for v in barcode_counts.values() if v >= 5)

In [None]:
sum(1 for v in barcode_counts.values() if 5 <= v < 20)

In [None]:
sum(1 for v in barcode_counts.values() if 20 <= v < 100)

In [None]:
sum(1 for v in barcode_counts.values() if v >= 100)

In [None]:
sum(1 for v in barcode_counts.values() if v >= 100)

In [None]:
sum(1 for v in barcode_counts.values() if v >= 2)

In [None]:
max(barcode_counts.values())

In [None]:
sum(v for v in barcode_counts.values() if v == 1) / total

In [None]:
sum(1 for v in barcode_counts.values() if v >= 2)

In [None]:
sum(v for v in barcode_counts.values() if v >= 10)

In [None]:
sum(v for v in barcode_counts.values() if v >= 10) / filtered

In [None]:
filtered

In [None]:
sum(v for v in barcode_counts.values())

In [None]:
n, bins, patches = plt.hist(
    barcode_counts.values(),
    100,
    range=(0, 20),
    histtype="step",
    density=True,
    cumulative=-1,
    log=True,
)

In [None]:
gfa.try_get_segment("BIT1OFF")

In [None]:
import uuid

In [None]:
u = uuid.UUID("6e507a8a-c271-4561-8768-0f9bf9d4c301")

In [None]:
import sys

In [None]:
sys.getsizeof(u.int)

In [None]:
sys.getsizeof("6e507a8a-c271-4561-8768-0f9bf9d4c301")

In [None]:
u.int

In [None]:
t["cg"][0]

In [None]:
t["name"].str.split(";")

# GAM

In [None]:
# 1) segment_cigars df (segment coördinates, normalize orientation) [numba]
# 2) segment_mismatches df (cellwise apply, get insertions/deletions/mismatches/equal)
# 3) filter on barcode mismatches (?)
# 4) group segment_cigars by barcode, run cigar_aggregation on non-barcode (or all!) segments
# 5)

In [None]:
import os
import sys
from pathlib import Path

sys.path.append(str(Path(os.environ["src"]) / "sequencing"))

import stream
import vg_pb2

In [None]:
temp_dir = Path("/home/jqs1/scratch/jqs1/sequencing/230726_carlos/no_sample/temp/")

In [None]:
gfa_filename = temp_dir / "barcode_daniel.gfa"
gfa = gfapy.Gfa.from_file(gfa_filename)
segments = {s.name: s.sequence for s in gfa.segments}

In [None]:
# gam_filename = "duplex_sup1_subsample_vg2.gam"
# gam_filename = "reverse_test_duplex.gam"
# gam_filename = "reverse_test_simplex.gam"
# gam_filename = "reverse_test_duplex_t1.gam"
gam_filename = temp_dir / "duplex_sup1.gam"

In [None]:
from enum import Enum
from typing import NamedTuple


class Op(Enum):
    """Kind of difference recorded by an :class:`Edit`."""

    INSERTION = 1
    DELETION = 2
    SUBSTITUTION = 3


class Edit(NamedTuple):
    """A single edit, rendered compactly as ``<code>:<index>:<payload>``."""

    # Kind of edit.
    op: Op
    # Position the edit refers to.
    index: int
    # Sequence payload (shown for insertions and substitutions); may be falsy.
    seq: str
    # Length payload (shown for deletions).
    length: int

    def __repr__(self):
        """Return ``I:idx:seq``, ``D:idx:len`` or ``S:idx:seq``.

        Any other ``op`` value falls back to ``op:idx:(seq or length or '')``.
        """
        sequence = self.seq or ""
        if self.op == Op.INSERTION:
            return f"I:{self.index}:{sequence}"
        if self.op == Op.DELETION:
            return f"D:{self.index}:{self.length}"
        if self.op == Op.SUBSTITUTION:
            return f"S:{self.index}:{sequence}"
        return f"{self.op}:{self.index}:{self.seq or self.length or ''}"

    __str__ = __repr__

In [None]:
from Bio.Seq import Seq


def local_index(index, length, is_reverse):
    """Return ``index`` unchanged, or ``length - index`` when ``is_reverse`` is truthy."""
    return length - index if is_reverse else index


def reverse_complement(seq):
    """Return the reverse complement of ``seq`` as a plain ``str`` (via Biopython ``Seq``)."""
    flipped = Seq(seq).reverse_complement()
    return str(flipped)


def reversed_seq(seq, is_reverse):
    """Return ``seq`` as-is, or its reverse complement when ``is_reverse`` is truthy."""
    return reverse_complement(seq) if is_reverse else seq


def extract_read_segments(msg):
    """Slice a read into the pieces aligned to each graph segment.

    Walks ``msg.path.mapping`` in order. Each mapping consumes
    ``sum(edit.to_length)`` bases of ``msg.sequence``; that slice is stored
    under the mapping's segment name, reverse-complemented when the mapping is
    on the reverse strand.

    The previous implementation advanced the read cursor only for
    matches/SNPs and pure insertions. An edit with
    ``from_length > to_length > 0`` was silently ignored, which shifted every
    later segment, and an edit with ``0 < from_length < to_length`` tripped an
    assertion. Every edit consumes exactly ``to_length`` read bases whatever
    its ``from_length`` is, so the cursor now advances uniformly.

    Parameters
    ----------
    msg
        Alignment-like object with ``sequence`` (str) and ``path.mapping``;
        each mapping has ``position.name``, ``position.is_reverse`` and an
        iterable ``edit`` whose items expose ``to_length``.

    Returns
    -------
    dict[str, str]
        Segment name -> read subsequence. If a segment is visited more than
        once, the last visit wins.
    """
    segment_seqs = {}
    read_index = 0
    for mapping in msg.path.mapping:
        segment_read_start = read_index
        # Deletions have to_length == 0 and therefore consume nothing.
        read_index += sum(edit.to_length for edit in mapping.edit)
        seq = msg.sequence[segment_read_start:read_index]
        if mapping.position.is_reverse:
            seq = reverse_complement(seq)
        segment_seqs[mapping.position.name] = seq
    return segment_seqs

# Raw reads

In [None]:
segment_names = list(segments.keys())

In [None]:
segment_names

In [None]:
msgs = list(it.islice(stream.parse(str(gam_filename), vg_pb2.Alignment), 100))

In [None]:
msgs[0]

In [None]:
# Tally, over the first 100k alignments, which segments reads start on, end
# on, and pass through. Endpoints are ordered by position in `segment_names`
# so that a read's orientation does not matter.
starts = Counter()
ends = Counter()
counts = Counter()
# O(1) rank lookup instead of a list.index() scan for every message.
segment_rank = {name: rank for rank, name in enumerate(segment_names)}
for msg in tqdm(it.islice(stream.parse(str(gam_filename), vg_pb2.Alignment), 100_000)):
    mappings = msg.path.mapping
    if not mappings:
        # An alignment with an empty path has no endpoints; indexing [0]
        # would raise IndexError.
        continue
    endpoints = (mappings[0].position.name, mappings[-1].position.name)
    if segment_rank[endpoints[0]] >= segment_rank[endpoints[1]]:
        endpoints = endpoints[::-1]
    starts[endpoints[0]] += 1
    ends[endpoints[1]] += 1
    for mapping in mappings:
        counts[mapping.position.name] += 1

In [None]:
sorted(starts.items(), key=operator.itemgetter(1), reverse=True)

In [None]:
sorted(ends.items(), key=operator.itemgetter(1), reverse=True)

In [None]:
sorted(counts.items(), key=operator.itemgetter(1), reverse=True)

# Group by barcode

In [None]:
# gam_filename = "duplex_sup1_subsample_vg2.gam"
# gam_filename = "duplex_sup1_vg2.gam"

In [None]:
%%time
barcode_msgs = defaultdict(list)
# for msg in tqdm(it.islice(stream.parse(gam_filename, vg_pb2.Alignment), 100_000)):
for msg in tqdm(
    it.islice(stream.parse(str(gam_filename), vg_pb2.Alignment), 10_000_000)
):
    # for msg in tqdm(stream.parse(gam_filename, vg_pb2.Alignment)):
    path = set([m.position.name for m in msg.path.mapping])
    if not (("BIT0ON" in path or "BIT0OFF" in path) and "SPACER2" in path):
        continue
    barcode = tuple(1 if f"BIT{i}ON" in path else 0 for i in range(30))
    barcode_msgs[barcode].append(msg)

In [None]:
%%time
threshold = 3
culling_interval = 100_000
reps = 30
barcode_msgs = defaultdict(list)

reader = stream.parse(gam_filename, vg_pb2.Alignment)
for rep in trange(reps):
    try:
        for msg in tqdm(it.islice(reader, culling_interval)):
            path = set([m.position.name for m in msg.path.mapping])
            if not (("BIT0ON" in path or "BIT0OFF" in path) and "SPACER2" in path):
                continue
            barcode = tuple(1 if f"BIT{i}ON" in path else 0 for i in range(30))
            barcode_msgs[barcode].append(msg)
    except StopIteration:
        print("DONE!")
        pass
    remove_keys = []
    for k, v in barcode_msgs.items():
        if len(v) < threshold:
            remove_keys.append(k)
    for k in remove_keys:
        del barcode_msgs[k]

In [None]:
len(barcode_msgs)

In [None]:
list(sorted(Counter(len(v) for k, v in barcode_msgs.items()).items()))

In [None]:
barcode_msgs_cluster = defaultdict(list)
for k, v in barcode_msgs.items():
    barcode_msgs_cluster[len(v)].append(v)

In [None]:
msgs = barcode_msgs_cluster[109][1]

In [None]:
del barcode_msgs_cluster, barcode_msgs

# Copy number distribution

In [None]:
def count(values):
    """Return ``(value, occurrences)`` pairs for ``values``, sorted ascending."""
    pairs = list(Counter(values).items())
    pairs.sort()
    return pairs


def hist(ary, normalize=False):
    """Overlay a stepwise mass function and its reverse cumulative sum.

    Parameters
    ----------
    ary
        Sequence of ``(bin, weight)`` pairs, e.g. the output of ``count``.
    normalize : bool
        If true, weights are divided by their total before plotting.

    Returns
    -------
    holoviews overlay
        Two log-y ``hv.Curve`` objects: the weights on a dense integer grid
        from the smallest to the largest bin (missing bins are 0), and the
        reverse cumulative sum at the observed bins.
    """
    table = np.array(ary)
    bins, pmf = table[:, 0], table[:, 1]
    if normalize:
        pmf = pmf / pmf.sum()
    # Reverse cumulative sum: total weight at or above each bin.
    rcmf = np.cumsum(pmf[::-1])[::-1]
    pmf_bins = np.arange(bins.min(), bins.max() + 1)
    weight_by_bin = dict(zip(bins, pmf))
    pmf_full = np.zeros(len(pmf_bins))
    for position, edge in enumerate(pmf_bins):
        if edge in weight_by_bin:
            pmf_full[position] = weight_by_bin[edge]
    pmf_curve = hv.Curve((pmf_bins, pmf_full)).options(
        interpolation="steps-mid", logy=True
    )
    rcmf_curve = hv.Curve((bins, rcmf)).options(interpolation="steps-pre", logy=True)
    return pmf_curve * rcmf_curve

In [None]:
len(barcode_msgs)

In [None]:
sum(len(msgs) for bc, msgs in barcode_msgs.items())

In [None]:
num_reads_by_depth = sorted(
    {
        num_reads: len(barcodes) * num_reads
        for num_reads, barcodes in barcode_msgs_cluster.items()
    }.items()
)

In [None]:
num_reads_by_depth_duplex = sorted(
    {
        num_reads: sum(sum(";" in msg.name for msg in msgs) for msgs in barcodes)
        for num_reads, barcodes in barcode_msgs_cluster.items()
    }.items()
)

In [None]:
hist(num_reads_by_depth, normalize=False).redim(
    x="reads_per_barcode", y="total_reads"
).opts(width=800, height=600)

In [None]:
sum(len(x) == 1 for x in barcode_msgs.values())

In [None]:
%%time
sum(sum(";" in msg.name for msg in x) >= 1 for x in barcode_msgs.values())

In [None]:
%%time
sum(sum(";" in msg.name for msg in x) >= 2 for x in barcode_msgs.values())

In [None]:
count_dist = count(len(v) for v in barcode_msgs.values())

In [None]:
count_dist_duplex = count(
    sum(";" in msg.name for msg in v) for v in barcode_msgs.values()
)

In [None]:
hist(count_dist, normalize=True).redim(x="reads_per_barcode", y="counts").opts(
    width=800, height=600
)

In [None]:
count_dist_duplex

In [None]:
hist(count_dist_duplex).opts(width=800, height=600)

In [None]:
counts = np.array(sorted(count_dist2.items()))
bins = counts[:, 0]

In [None]:
pmf = counts[:, 1] / counts[:, 1].sum()

In [None]:
rcmf = np.cumsum(pmf[::-1])[::-1]

In [None]:
bins

In [None]:
hv.Curve((bins, rcmf)).options(interpolation="steps-pre", logy=True)

In [None]:
pmf

In [None]:
# count_dist is already the sorted list of (value, count) pairs returned by
# count(); a list has no .items().
hist(count_dist)

In [None]:
# count_dist is already a sorted list of (value, count) pairs (see count()).
count_dist

# Edit distances

In [None]:
import edlib

In [None]:
extract_read_segments(msgs[0])["RBS2"]

In [None]:
extract_read_segments(msgs[1])["RBS2"]

In [None]:
edlib.align("AACATAGAGGGATTATTGG", "AACATAAGGGGCTCCG", task="path")["editDistance"]

In [None]:
import edlib


def align(a, b):
    """Print edlib's "nice" alignment of ``a`` against ``b``, one row per line."""
    result = edlib.align(a, b, task="path")
    rows = edlib.getNiceAlignment(result, a, b).values()
    print("\n".join(rows))

In [None]:
align("AACATAGAGGGATTATTGG", "AACATAAGGGGCTCCG")

In [None]:
def distance_matrices(msgs, segments):
    """Pairwise edit-distance matrix between reads, one matrix per segment.

    Parameters
    ----------
    msgs
        Sequence of alignment messages accepted by ``extract_read_segments``.
    segments
        Mapping whose keys are the segment names to consider.

    Returns
    -------
    dict[str, numpy.ndarray]
        Segment name -> symmetric ``(len(msgs), len(msgs))`` ``uint16`` matrix
        of edlib edit distances with a zero diagonal. Only segments present in
        *every* read are included. Previously only the first read was checked,
        so a later read lacking the segment raised ``KeyError``. An empty
        ``msgs`` returns ``{}`` instead of raising ``IndexError``.
    """
    extracted_segments = [extract_read_segments(msg) for msg in msgs]
    if not extracted_segments:
        return {}
    Ms = {}
    for segment in tqdm(segments.keys()):
        if any(segment not in read_segments for read_segments in extracted_segments):
            continue
        seqs = [read_segments[segment] for read_segments in extracted_segments]
        M = np.zeros((len(seqs), len(seqs)), dtype=np.uint16)
        # Fill the lower triangle and mirror it; the diagonal stays 0.
        for i, s1 in enumerate(seqs):
            for j in range(i):
                M[i, j] = M[j, i] = edlib.align(s1, seqs[j])["editDistance"]
        Ms[segment] = M
    return Ms

In [None]:
import scipy
from scipy.cluster import hierarchy


def sorted_matrix_indices(arr):
    """Return the dendrogram leaf order of a square distance matrix.

    ``arr`` is condensed with ``squareform``, clustered with single linkage,
    and the resulting dendrogram's ``"leaves"`` list is returned (no plot is
    drawn).
    """
    condensed = scipy.spatial.distance.squareform(arr)
    tree = hierarchy.linkage(condensed, method="single", metric="euclidean")
    layout = hierarchy.dendrogram(tree, no_plot=True, color_threshold=-np.inf)
    return layout["leaves"]


def sort_matrix(arr, idxs=None):
    """Reorder rows and columns of ``arr`` by the same index list.

    When ``idxs`` is ``None`` the order comes from ``sorted_matrix_indices(arr)``.
    """
    order = sorted_matrix_indices(arr) if idxs is None else idxs
    rows_reordered = arr[order, :]
    return rows_reordered[:, order]

In [None]:
0

In [None]:
msgs = barcode_msgs_cluster[444][0]

In [None]:
Ms = distance_matrices([msg for msg in msgs if ";" in msg.name], segments)
Ms2 = distance_matrices([msg for msg in msgs], segments)

In [None]:
idxs = sorted_matrix_indices(Ms["BetI"])
plt.imshow(sort_matrix(Ms["BetI"], idxs));

In [None]:
segments.keys()

In [None]:
idxs = sorted_matrix_indices(Ms["RBS1"])

In [None]:
idxs = sorted_matrix_indices(Ms["RBS1"] + Ms["RBS2"] + Ms["RBS3"])

In [None]:
Ms["RBS2"].shape

In [None]:
Ms["RBS2"].mean()

In [None]:
plt.imshow(sort_matrix(Ms["RBS1"], idxs));

In [None]:
plt.imshow(sort_matrix(Ms["RBS2"], idxs));

In [None]:
plt.imshow(sort_matrix(Ms["RBS3"], idxs));

In [None]:
idxs2 = sorted_matrix_indices(Ms2["RBS2"])

In [None]:
idxs2 = sorted_matrix_indices(Ms2["RBS1"] + Ms2["RBS2"] + Ms2["RBS3"])

In [None]:
segments.keys()

In [None]:
idxs2 = sorted_matrix_indices(Ms2["pBetI"] + Ms2["pTac"] + Ms2["pPhlF"] + +Ms2["BetI"])

In [None]:
# TODO: try adding barcode bits?? does that help? [no]

In [None]:
plt.imshow(sort_matrix(Ms2["pBetI"] + Ms2["pTac"] + Ms2["pPhlF"] + Ms2["BetI"], idxs2));

In [None]:
plt.imshow(sort_matrix(Ms2["RBS1"] + Ms2["RBS2"] + Ms2["RBS3"], idxs2));

In [None]:
plt.imshow(sort_matrix(Ms2["RBS1"], idxs2));

In [None]:
plt.imshow(sort_matrix(Ms2["RBS2"], idxs2));

In [None]:
plt.imshow(sort_matrix(Ms2["RBS3"], idxs2));

In [None]:
plt.imshow(sort_matrix(Ms2["pPhlF"], idxs2));

In [None]:
plt.imshow(sort_matrix(Ms2["BetI"], idxs2));

In [None]:
extract_read_segments(msgs[idxs2[0]])["RBS2"]

In [None]:
extract_read_segments(msgs[idxs2[12]])["RBS2"]

In [None]:
edlib.align("AACATAGAAAGGGGGTTCCT", "AACATAGAAGGGGGTCCTCG")["editDistance"]

In [None]:
np.argsort(M[0, :])

In [None]:
Counter([extract_read_segments(msg)["RBS2"] for msg in msgs if ";" in msg.name])

In [None]:
segments["RBS2"]

In [None]:
Counter([extract_read_segments(msg)["RBS1"] for msg in msgs])

In [None]:
len(msgs)

In [None]:
def distance_matrices2(msgs):
    """Pairwise edit-distance matrices per segment and per segment flank.

    For every segment extracted from a read, two extra entries are derived:
    ``<segment>_start`` (first 20 bases) and ``<segment>_end`` (last 20 bases).
    A distance matrix is then computed for every key of the first read that is
    present in all reads.

    Parameters
    ----------
    msgs
        Sequence of alignment messages accepted by ``extract_read_segments``.

    Returns
    -------
    dict[str, numpy.ndarray]
        Key -> symmetric ``(len(msgs), len(msgs))`` ``uint16`` matrix of edlib
        edit distances. Keys missing from any read are skipped (previously
        this raised ``KeyError``). An empty ``msgs`` returns ``{}`` instead of
        raising ``IndexError``.
    """
    extracted_segments = [extract_read_segments(msg) for msg in msgs]
    if not extracted_segments:
        return {}
    for x in extracted_segments:
        # Snapshot the keys: the dict grows while the flank entries are added.
        for k in list(x.keys()):
            x[k + "_start"] = x[k][:20]
            x[k + "_end"] = x[k][-20:]
    Ms = {}
    for segment in tqdm(extracted_segments[0].keys()):
        if any(segment not in x for x in extracted_segments):
            continue
        seqs = [x[segment] for x in extracted_segments]
        M = np.zeros((len(seqs), len(seqs)), dtype=np.uint16)
        # Fill the lower triangle and mirror it; the diagonal stays 0.
        for i, s1 in enumerate(seqs):
            for j in range(i):
                M[i, j] = M[j, i] = edlib.align(s1, seqs[j])["editDistance"]
        Ms[segment] = M
    return Ms

In [None]:
Mss = distance_matrices2([msg for msg in msgs if ";" in msg.name])
Mss2 = distance_matrices2([msg for msg in msgs])

In [None]:
segments.keys()

In [None]:
idxss = sorted_matrix_indices(Mss["pPhlF_start"])
idxss2 = sorted_matrix_indices(Mss2["pPhlF_start"])

In [None]:
plt.imshow(sort_matrix(Mss["pPhlF_end"], idxss));

# Variants

In [None]:
count_dist

In [None]:
msgs = barcode_msgs_cluster[5][0]

In [None]:
# gam_filename2 = "duplex_sup1_vg2.gam"
gam_filename2 = "duplex_sup1_subsample_vg2.gam"
msgs3 = []
lens = []
for msg in tqdm(it.islice(stream.parse(gam_filename2, vg_pb2.Alignment), 10_000_000)):
    lens.append(len(msg.sequence))
    if 800 <= len(msg.sequence) <= 900:
        msgs3.append(msg)
    # path = set([m.position.name for m in msg.path.mapping])
    # if not (("BIT0ON" in path or "BIT0OFF" in path) and "mScarletI" in path):
    #     continue
    # msgs2.append(msg)

In [None]:
msgs3 = list(tqdm(stream.parse("duplex_sup1_subsample_vg2.gam", vg_pb2.Alignment)))

In [None]:
for i in range(100):
    msg = msgs3[i]
    print(len(msg.sequence))
    print(" ".join(m.position.name for m in msg.path.mapping))
    print()

In [None]:
len(
    "TCCTCAATCGCACTGGAAACATCAAGGTCGACGAAAGACCGCTGAGGAGCCAGATACATAGATTACCACAACTCCGAGCCCTTCCACCAAAAAAAACAGATAGCCGCGCGAACGCGGCTAACTGTTGAAAAAAAACAGATAACAGATACCGAAGTATCTGTTATCTTTCCCAAAAAACCCCTCAAGACCCGTTTAGAGGCCCCAAGGGGTTATTACTGATGGCAATGTGATGTCCTCATCTTACTCCCTCTAGTCTATCATTACCCTCCTCCTGCTCTTAACTACCCTCATTCCGACCCTTACTACTACATCATCGACCTTTCTCCATACCCAACTGTCCTAACAACCAACTACTCCGCCTCTTCATCCTCTTTCAACGTTCTCCCTCTATCAACTCAGCAACCACACTCAACTACCATGACATTACACCTCATTCTCCCGACTTTCCACATACTTCCCAGTTTACTCCCTACACCTCCAAGATTCCATACCCACTCTCTTCGCTCTCTACACCCACCAATAAGTTCCTAACAAATCACATCCCGTATCTGTTATGTAATTGCTAGTTAAACAACCCATCCCACCAGATAAATCATTCCCACTACCCGTCAATCCACCATTCCTCAACGAAACTTCATCACTCTCCTCCGCACCCTAACATACAACTCTCGAATACTCTCCCACCTCAACTGCTTCTTCTCTTACACCCTCTGTCTATCATCTCCAAACCACAGACATCTTCTCTCCAACCTTCGCCCTCTTACTTATCTACCCAGACTCCACTACTACTCACTCTGTCACCATAATTCCTCCTCCTGATCCTCCTTCAATACATCCCGAAACACACACTAAACCACCCGTCACCTTTCTCCTTTCCTCTGAGGCTAGCTAACGTTACTGTACGGTATTGTAGAAAAAGGCATAGTGCTGCTAACGTTCGTCCCTATAGTGAGTCGTATTATGTAGTTCCTTATCATCTGC"
)

In [None]:
len(
    "CTGAGGAGCCAGATACATAGATTACCACAACTCCGAGCCCTTCCACCAAAAAAAACAGATAGCCGCGCGAACGCGGCTAACTGTTGAAAAAAAACAGATAACAGATACCGAAGTATCTGTTATCTTTCCCAAAAAACCCCTCAAGACCCGTTTAGAGGCCCCAAGGGGTTATTACTGATGGCAATGTGATGTCCTCATCTTACTCCCTCTAGTCTATCATTACCCTCCTCCTGCTCTTAACTACCCTCATTCCGACCCTTACTACTACATCATCGACCTTTCTCCATACCCAACTGTCCTAACAACCAACTACTCCGCCTCTTCATCCTCTTTCAACGTTCTCCCTCTATCAACTCAGCAACCACACTCAACTACCATGACATTACACCTCATTCTCCCGACTTTCCACATACTTCCCAGTTTACTCCCTACACCTCCAAGATTCCATACCCACTCTCTTCGCTCTCTACACCCACCAATAAGTTCCTAACAAATCACATCCCGTATCTGTTATGTAATTGCTAGTTAAACAACCCATCCCACCAGATAAATCATTCCCACTACCCGTCAATCCACCATTCCTCAACGAAACTTCATCACTCTCCTCCGCACCCTAACATACAACTCTCGAATACTCTCCCACCTCAACTGCTTCTTCTCTTACACCCTCTGTCTATCATCTCCAAACCACAGACATCTTCTCTCCAACCTTCGCCCTCTTACTTATCTACCCAGACTCCACTACTACTCACTCTGTCACCATAATTCCTCCTCCTGATCCTCCTTCAATACATCCCGAAACACACACTAAACCACCCGTCACCTTTCTCCTTTCCTCT"
)

In [None]:
import pyfastx

In [None]:
fq = pyfastx.Fastq("duplex_sup1_subsample.fastq")

In [None]:
# A dangling "fq." (leftover tab completion) is a SyntaxError; show the object instead.
fq

In [None]:
fq["e8a89209-b020-4f67-ab33-6a97fa35366e;6ce12155-a33f-4657-bb71-eebc07dd1ff7"].seq

In [None]:
%%time
# offsets = []
# ids = set()
for msg in msgs3:
    if 800 <= len(msg.sequence) <= 900 and msg.path.mapping[0].position.name == "BetI":
        if 760 <= msg.path.mapping[0].position.offset <= 780 and ";" in msg.name:
            # ids.add(msg.name)
            print(f">{msg.name}")
            print(fq[msg.name].seq)
            # print(msg.sequence)
        # print(msg);0/0
        # offsets.append(msg.path.mapping[0].position.offset)

In [None]:
len(ids)

In [None]:
plt.hist(offsets, range=(720, 820), bins=100);

In [None]:
msg.name

In [None]:
plt.hist(lens, bins=100);

In [None]:
plt.hist(lens, bins=100);

In [None]:
parts = {
    "LacI": "ATGAAACCAGTAACGTTATACGATGTCGCAGAGTATGCCGGTGTCTCTTATATGACCGTTTCCCGCGTGGTGAACCAGGCCAGCCACGTTTCTGCGAAAACGCGGGAAAAAGTGGAAGCGGCGATGGTGGAGCTGAATTACATTCCCAACCGCGTGGCACAACAACTGGCGGGCAAACAGTCGTTGCTGATTGGCGTTGCCACCTCCAGTCTGGCCCTGCACGCGCCGTCGCAAATTGTCGCGGCGATTAAATCTCGCGCCGATCAACTGGGTGCCAGCGTGGTGGTGTCGATGGTAGAACGAAGCGGCGTCGAAGCCTGTAAAGCGGCGGTGCACAATCTTCTCGCGCAACGCGTCAGTGGGCTGATCATTAACTATCCGCTGGATGACCAGGATGCCATTGCTGTGGAAGCTGCCTGCACTAATGTTCCGGCGTTATTTCTTGATGTCTCTGACCAGACACCCATCAACAGTATTATTTACTCCCATGAGGACGGTACGCGACTGGGCGTGGAGCATCTGGTCGCATTGGGTCACCAGCAAATCGCGCTGTTAGCGGGCCCATTAAGTTCTGTCTCGGCGCGTCTGCGTCTGGCTGGCTGGCATAAATATCTCACTCGCAATCAAATTCAGCCGATAGCGGAACGGGAAGGCGACTGGAGTGCCATGTCCGGTTTTCAACAAACCATGCAAATGCTGAATGAGGGCATCGTTCCCACTGCGATGCTGGTTGCCAACGATCAGATGGCGCTGGGCGCAATGCGCGCCATTACCGAGTCCGGGCTGCGCGTTGGTGCGGATATCTCGGTAGTGGGATACGACGATACCGAAGATAGCTCATGTTATATCCCGCCGTTAACCACCATCAAACAGGATTTTCGCCTGCTGGGGCAAACCAGCGTGGACCGCTTGCTGCAACTCTCTCAGGGCCAGGCGGTGAAGGGCAATCAGCTGTTGCCAGTCTCACTGGTGAAAAGAAAAACCACCCTGGCGCCCAATACGCAAACCGCCTCTCCCCGCGCGTTGGCCGATTCATTAATGCAGCTGGCACGACAGGTTTCCCGACTGGAAAGCGGGCAGT",
    "PhlF": "ATGGCACGTACCCCGAGCCGTAGCAGCATTGGTAGCCTGCGTAGTCCGCATACCCATAAAGCAATTCTGACCAGCACCATTGAAATCCTGAAAGAATGTGGTTATAGCGGTCTGAGCATTGAAAGCGTGGCACGTCGCGCCGGTGCAGGCAAACCGACCATTTATCGTTGGTGGACCAACAAAGCAGCACTGATTGCCGAAGTGTATGAAAATGAAATCGAACAGGTACGTAAATTTCCGGATTTGGGTAGCTTTAAAGCCGATCTGGATTTTCTGCTGCATAATCTGTGGAAAGTTTGGCGTGAAACCATTTGTGGTGAAGCATTTCGTTGTGTTATTGCAGAAGCACAGTTGGACCCTGTAACCCTGACCCAACTGAAAGATCAGTTTATGGAACGTCGTCGTGAGATACCGAAAAAACTGGTTGAAGATGCCATTAGCAATGGTGAACTGCCGAAAGATATCAATCGTGAACTGCTGCTGGATATGATTTTTGGTTTTTGTTGGTATCGCCTGCTGACCGAACAGTTGACCGTTGAACAGGATATTGAAGAATTTACCTTCCTGCTGATTAATGGTGTTTGTCCGGGTACACAGTGTTAA",
    "BetI": "ATGCCGAAACTGGGTATGCAGAGCATTCGTCGTCGTCAGCTGATTGATGCAACCCTGGAAGCAATTAATGAAGTTGGTATGCATGATGCAACCATTGCACAGATTGCACGTCGTGCCGGTGTTAGCACCGGTATTATTAGCCATTATTTCCGCGATAAAAACGGTCTACTGGAAGCAACCATGCGTGATATTACCAGCCAGCTGCGTGATGCAGTTCTGAATCGTCTGCATGCACTGCCGCAGGGTAGCGCAGAACAGCGTCTGCAGGCAATTGTTGGTGGTAATTTTGATGAAACCCAGGTTAGCAGCGCAGCAATGAAAGCATGGCTGGCATTTTGGGCAATCAGCATGCATCAGCCGATGCTGTATCGTCTGCAGCAGGTTAGCAGTCGTCGTCTGCTGAGCAATCTGGTTAGCGAATTTCGTCGTGAACTGCCTCGTGAACAGGCACAAGAGGCAGGTTATGGTCTGGCAGCACTGATTGATGGTCTGTGGCTGCGTGCAGCACTGAGCGGTAAACCGCTGGATAAAACCCGTGCAAATAGCCTGACCCGTCATTTTATCACCCAGCATCTGCCGACCGATTAA",
}

In [None]:
parts.keys()

In [None]:
?edlib.align

In [None]:
%%time
# Classify reads of length 800-900 by which reference parts they contain.
# Each such read is looked up in the FASTQ by name and aligned (edlib "HW"
# mode) against every entry of `parts`, in both orientations. The key is a
# tuple of booleans, one per part in `parts` order, True when the edit
# distance is <= snp_threshold. Reads are binned under that key, separately
# for names containing ";" (duplex) and all others (simplex).
# offsets = []
snp_threshold = 50
ids = set()  # not populated in this cell (the add() call below is commented out)
misassembled_simplex = defaultdict(list)
misassembled_duplex = defaultdict(list)
for msg in tqdm(msgs3):
    if 800 <= len(msg.sequence) <= 900:
        read = fq[msg.name].seq
        # print(msg.name)
        # Distances of each part as written ...
        d1 = tuple(
            edlib.align(part, read, mode="HW")["editDistance"]
            for part in parts.values()
        )
        # ... and of each part reverse-complemented.
        d2 = tuple(
            edlib.align(reverse_complement(part), read, mode="HW")["editDistance"]
            for part in parts.values()
        )
        # Use the orientation whose best-matching part is closest; ties go to
        # the forward orientation.
        if min(d1) <= min(d2):
            d = d1
        else:
            d = d2
        # print(k)
        # print()
        # continue
        key = tuple(dd <= snp_threshold for dd in d)
        if ";" in msg.name:
            misassembled_duplex[key].append(read)
        else:
            misassembled_simplex[key].append(read)
        # ids.add(msg.name)
        # print(f">{msg.name}")
        # print(fq[msg.name].seq)
        # print(msg.sequence)
        # print(msg);0/0
        # offsets.append(msg.path.mapping[0].position.offset)

In [None]:
{k: len(v) for k, v in misassembled_duplex.items()}

In [None]:
{k: len(v) for k, v in misassembled_simplex.items()}

In [None]:
idx = 9
read = misassembled_duplex[(False, False, False)][idx]
end = "ATCACATTGCCATCAGTAATAACCCCTTGGGGCCTCTAAACGGGTCTTGAGGGGTTTTTTGGGAAAGATAACAGATACTTCGGTATCTGTTATCTGTTTTTTTTCAACAGATAGCCGCGTTCGCGCGGCTATCTGTTTTTTTTGGTGGAAGGGCTCGGAGTTGTGGTAATCTATGTATCCTGG"
print(edlib.align(end, read, mode="HW"))
print(edlib.align(reverse_complement(end), read, mode="HW"))

In [None]:
BARCODE_END = "ATCACATTGCCATCAGTAATAACCCCTTGGGGCCTCTAAACGGGTCTTGAGGGGTTTTTTG"


def trim_barcode(read, end=BARCODE_END, threshold=10):
    """Split ``read`` at the ``end`` motif, normalising orientation.

    The motif is searched in ``read`` with edlib in ``"HW"`` mode, first as
    given and then reverse-complemented. A hit counts when its edit distance
    is ``<= threshold``.

    Parameters
    ----------
    read : str
        Read sequence.
    end : str
        Motif marking the split point.
    threshold : int
        Maximum edit distance accepted for a motif hit.

    Returns
    -------
    tuple[str, str]
        * Forward hit: ``(read[cut:], read[:cut])`` where ``cut`` is the
          estimated start of the motif.
        * Reverse hit: the read is cut just after the reverse-complemented
          motif and both pieces are reverse-complemented, so the first
          element again begins at the motif.
        * No hit: ``(read, "")``. This used to return the bare string, so
          ``trim_barcode(read)[0]`` silently produced only the first character
          of an untrimmed read.
    """
    forward = edlib.align(end, read, mode="HW")
    if forward["editDistance"] <= threshold:
        # Only the hit's end coordinate is used; the start is estimated by
        # assuming the hit spans len(end) read bases. Clamp at 0 so a hit
        # near the read start cannot turn into a negative (wrap-around) index.
        cut = max(0, forward["locations"][0][1] - len(end) + 1)
        return read[cut:], read[:cut]
    reverse = edlib.align(reverse_complement(end), read, mode="HW")
    if reverse["editDistance"] <= threshold:
        cut = reverse["locations"][0][1] + 1
        return reverse_complement(read[:cut]), reverse_complement(read[cut:])
    return read, ""

In [None]:
read

In [None]:
trim_barcode(read)

In [None]:
trim_barcode(reverse_complement(read))

In [None]:
# TTT: full circuit, no barcode (?)

In [None]:
{k: len(v) for k, v in misassembled_duplex.items()}

In [None]:
for read in misassembled_duplex[(False, False, False)]:
    print(">FFF")
    print(trim_barcode(read)[0])

In [None]:
{k: len(v) for k, v in misassembled_simplex.items()}

In [None]:
Counter([extract_read_segments(msg)["RBS2"] for msg in msgs if ";" in msg.name])

In [None]:
Counter([extract_read_segments(msg)["RBS2"] for msg in msgs])