In [None]:
import itertools as it
from collections import Counter, defaultdict
from functools import partial
from glob import glob
from pathlib import Path

import gfapy
import holoviews as hv
import hvplot.pandas
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.dataset as ds
from tqdm.auto import tqdm, trange

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import paulssonlab.sequencing.align as align
import paulssonlab.sequencing.cigar as scigar
import paulssonlab.sequencing.consensus as con
import paulssonlab.sequencing.gfa as sgfa
import paulssonlab.sequencing.io as sio
import paulssonlab.sequencing.processing as processing
from paulssonlab.util.sequence import reverse_complement

In [None]:
# Load the Bokeh backend for HoloViews (hvplot output is rendered through it).
hv.extension("bokeh")

In [None]:
# Turn on Polars' global string cache for the rest of the session.
# NOTE(review): presumably needed so Categorical columns read from different
# files can be concatenated/compared — confirm.
pl.enable_string_cache()

# Functions

In [None]:
def concat_glob(filename):
    """Lazily scan every Arrow IPC file matching a glob pattern and concatenate them.

    Parameters
    ----------
    filename : str
        Glob pattern (as understood by ``glob.glob``) selecting the IPC files.

    Returns
    -------
    polars.LazyFrame
        Diagonal concatenation (``how="diagonal"``) of the scanned files, in
        sorted path order.

    Raises
    ------
    ValueError
        If no file matches ``filename``.
    """
    # glob() returns paths in arbitrary, filesystem-dependent order; sorting
    # makes the row order of the result reproducible between runs.
    paths = sorted(glob(filename))
    if not paths:
        # Fail with an explicit message instead of the opaque error raised by
        # concatenating an empty list.
        raise ValueError(f"no files match pattern: {filename!r}")
    return pl.concat([pl.scan_ipc(path) for path in paths], how="diagonal")

In [None]:
def label_columns(cols, func=None):
    """Build a chained ``when/then`` expression labelling rows by their first non-null column.

    For each name in ``cols`` (in order) a branch "when this column is not
    null, then <label>" is appended, where the label is ``func(name)`` if
    ``func`` is given and the column name itself otherwise. No ``otherwise``
    branch is attached.

    Returns the chained expression, or ``None`` when ``cols`` is empty.
    """
    chain = None
    for name in cols:
        not_null = pl.col(name).is_not_null()
        # The first branch starts the chain; later ones extend it.
        branch = pl.when(not_null) if chain is None else chain.when(not_null)
        label = name if func is None else func(name)
        chain = branch.then(pl.lit(label))
    return chain

# Data

In [None]:
# Oligo pool for the BCD SD library. Every oligo shares the same 16-nt prefix
# and the same suffix; in between sits a 9-nt window in which one position is
# the degenerate base "N". The list is organised as consecutive groups of nine
# oligos that walk the N across the nine positions of one parent sequence.
bcd_sd_oligos = [
    "GACGGAAGACTGATGCNGTAAGGAGTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCGNTAAGGAGTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCGGNAAGGAGTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCGGTNAGGAGTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCGGTANGGAGTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCGGTAANGAGTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCGGTAAGNAGTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCGGTAAGGNGTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCGGTAAGGANTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCNGAGGCAGCTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCGNAGGCAGCTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCGGNGGCAGCTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCGGANGCAGCTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCGGAGNCAGCTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCGGAGGNAGCTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCGGAGGCNGCTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCGGAGGCANCTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCGGAGGCAGNTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCNTGCAGAGGTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCTNGCAGAGGTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCTTNCAGAGGTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCTTGNAGAGGTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCTTGCNGAGGTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCTTGCANAGGTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCTTGCAGNGGTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCTTGCAGANGTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCTTGCAGAGNTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCNTATCGGGGTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCTNATCGGGGTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCTTNTCGGGGTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCTTANCGGGGTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCTTATNGGGGTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCTTATCNGGGTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCTTATCGNGGTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCTTATCGGNGTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCTTATCGGGNTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCNGCGCGGTGTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCGNCGCGGTGTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCGGNGCGGTGTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCGGCNCGGTGTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCGGCGNGGTGTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCGGCGCNGTGTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCGGCGCGNTGTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCGGCGCGGNGTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCGGCGCGGTNTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCNCCGGTGTTTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCGNCGGTGTTTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCGCNGGTGTTTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCGCCNGTGTTTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCGCCGNTGTTTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCGCCGGNGTTTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCGCCGGTNTTTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCGCCGGTGNTTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCGCCGGTGTNTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCNATGTGTTTTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCTNTGTGTTTTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCTANGTGTTTTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCTATNTGTTTTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCTATGNGTTTTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCTATGTNTTTTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCTATGTGNTTTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCTATGTGTNTTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCTATGTGTTNTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCNGTCAAAATTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCCNTCAAAATTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCCGNCAAAATTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCCGTNAAAATTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCCGTCNAAATTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCCGTCANAATTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCCGTCAANATTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCCGTCAAANTTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCCGTCAAAANTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCNGATTCTAGTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCGNATTCTAGTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCGGNTTCTAGTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCGGANTCTAGTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCGGATNCTAGTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCGGATTNTAGTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCGGATTCNAGTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCGGATTCTNGTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCGGATTCTANTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCNCATGACCCTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCTNATGACCCTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCTCNTGACCCTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCTCANGACCCTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCTCATNACCCTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCTCATGNCCCTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCTCATGANCCTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCTCATGACNCTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCTCATGACCNTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCNCACGTCCCTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCTNACGTCCCTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCTCNCGTCCCTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCTCANGTCCCTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCTCACNTCCCTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCTCACGNCCCTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCTCACGTNCCTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCTCACGTCNCTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCTCACGTCCNTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCNCAGGCCCCTTAACAGTCTTCTTAAGGTGCCGGG​CCCACAT".replace("\u200b", ""),
    "GACGGAAGACTGATGCTNAGGCCCCTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCTCNGGCCCCTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCTCANGCCCCTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCTCAGNCCCCTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCTCAGGNCCCTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCTCAGGCNCCTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCTCAGGCCNCTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
    "GACGGAAGACTGATGCTCAGGCCCNTTAACAGTCTTCTTAAGGTGCCGGGCCCACAT",
]
# 13-nt slice of each oligo (indices 16..28) covering the variable window;
# the degenerate position is still the literal "N" here.
bcd_sds_Ns = [s[16:29] for s in bcd_sds_oligos] if False else [s[16:29] for s in bcd_sd_oligos]
# Expand every N into A, T, C and G (in that order). A nested comprehension
# replaces sum(list_of_lists, []), which re-copies the accumulator on every
# step. Duplicates (the unmutated parent appears once per position) are kept,
# exactly as before.
bcd_sds_13 = [s.replace("N", base) for s in bcd_sds_Ns for base in "ATCG"]
# 14-nt versions: each 13-mer with a trailing "A" appended.
bcd_sds = [f"{s}A" for s in bcd_sds_13]

# 240111_bcd_rbses_revio

In [None]:
# Read the consensus table(s) for this run into an eager DataFrame.
# NOTE(review): the glob pattern is handed directly to pl.read_ipc, whereas
# the other sections go through concat_glob() — confirm read_ipc expands the
# pattern and copes with per-file schema differences as intended.
arrow_filename = (
    "/home/jqs1/scratch/sequencing/240111_pLIB442-447_revio/consensus/*.arrow"
)
df = pl.read_ipc(arrow_filename)

In [None]:
# Add a `len` column holding the byte length of each consensus sequence.
# (The trailing comment is a disabled length cut-off.)
seqs = df.with_columns(
    len=pl.col("consensus_seq").str.len_bytes()
)  # .filter(pl.col("len") < 3500)
seqs

In [None]:
# Histogram (100 bins) of consensus sequence lengths.
seqs["len"].to_pandas().hvplot.hist(bins=100)

In [None]:
# Histogram (100 bins) of the grouping_depth column.
seqs["grouping_depth"].to_pandas().hvplot.hist(bins=100)

In [None]:
# Show the consensus sequence stored in row 10 as a spot check.
seqs[10, "consensus_seq"]

# 231201_bcd_rbses

In [None]:
%%time
# arrow_filename = "/home/jqs1/scratch/sequencing/231201_bcd_rbses_run3/20231201_1101_1F_PAU05823_773c75ee/extract_segments/*.arrow"
# NOTE(review): the first assignment below is dead — it is overwritten by the
# next line, so this section actually loads the 231130 minion
# extract_segments files. Confirm which dataset is intended here.
arrow_filename = "/home/jqs1/scratch/sequencing/231201_bcd_rbses_run3/20231201_1101_1F_PAU05823_773c75ee/consensus/*.arrow"
arrow_filename = "/home/jqs1/scratch/sequencing/231130_bcd_rbses_run3_minion/20231130_1904_MN35044_FAX40126_ee95ee31/extract_segments/*.arrow"
# Scan all matching files lazily, then materialise the result.
df = concat_glob(arrow_filename).collect()

In [None]:
# grouping_depth sorted from largest to smallest, as a step plot with log y axis.
df["grouping_depth"].sort(descending=True).to_pandas().hvplot.step(logy=True)

In [None]:
# grouping_depth as a NumPy array for the quick sums below.
x = np.asarray(df["grouping_depth"])

In [None]:
# Total grouping_depth contributed by entries with grouping_depth > 300.
x[x > 300].sum()

In [None]:
# Total grouping_depth over all entries (denominator for the value above).
x.sum()

In [None]:
%%time
# Per-row flags:
#   dup    - the value in `name` occurs more than once in the table
#   e2e    - exactly two of the four listed BC:UPSTREAM / pLIB430-435:upstream
#            segment names occur in variants_path
#   bc_e2e - same test for the BC:UPSTREAM / BC:SPACER2 names
# NOTE(review): the "<" / ">" prefixes presumably encode orientation — confirm.
df2 = df.with_columns(
    dup=pl.col("name").is_duplicated(),
    **{
        flag: pl.col("variants_path").list.set_intersection(segments).list.len() == 2
        for flag, segments in (
            (
                "e2e",
                [
                    "<BC:UPSTREAM",
                    "<pLIB430-435:upstream",
                    ">BC:UPSTREAM",
                    ">pLIB430-435:upstream",
                ],
            ),
            (
                "bc_e2e",
                [
                    "<BC:UPSTREAM",
                    "<BC:SPACER2",
                    ">BC:UPSTREAM",
                    ">BC:SPACER2",
                ],
            ),
        )
    },
)

In [None]:
# Share of reads (grouping_depth + grouping_duplex_depth) that fall in rows
# which are end-to-end and not duplicated, versus all other rows.
(
    df2.with_columns(
        num_reads=pl.col("grouping_depth") + pl.col("grouping_duplex_depth"),
        good_alignment=pl.col("e2e") & ~pl.col("dup"),
    )
    .group_by("good_alignment")
    .agg(pl.col("num_reads").sum())
    .with_columns(frac=pl.col("num_reads") / pl.col("num_reads").sum())
)

In [None]:
%%time
# Keep end-to-end, non-duplicated rows and add an "RBS" label column: the
# name (text before "|") of the first non-null pLIB433/434/435 "|seq" column,
# falling back to "pLIB431-432:RBS=<variant>".
df_variants = df2.filter(pl.col("e2e"), ~pl.col("dup")).with_columns(
    RBS=pl.coalesce(
        label_columns(
            [
                "pLIB433:PhlF_pPhlF|seq",
                "pLIB434:LacI_pTac|seq",
                "pLIB435:BetI_pBetI|seq",
            ],
            lambda col: col.partition("|")[0],
        ),
        pl.concat_str(pl.lit("pLIB431-432:RBS="), pl.col("pLIB431-432:RBS|variant")),
    )
)

In [None]:
# Number of rows for every (RBS label, promoter variant) combination.
(
    df_variants.select(["RBS", "pLIB430-435:promoter|variant"])
    .select(pl.struct(pl.all()).alias("variant"))
    .to_series()
    .value_counts()
    .unnest("variant")
    .to_pandas()
)

In [None]:
# Same counts restricted to rows with depth >= 5, pivoted so that RBS labels
# are rows and promoter variants are columns.
(
    df_variants.filter(pl.col("depth") >= 5)
    .select(["RBS", "pLIB430-435:promoter|variant"])
    .select(pl.struct(pl.all()).alias("variant"))
    .to_series()
    .value_counts()
    .unnest("variant")
    .to_pandas()
    .set_index(["RBS", "pLIB430-435:promoter|variant"])
    .unstack("pLIB430-435:promoter|variant")
)

In [None]:
# Occurrences of each BCD_RBS:RBS sequence among pLIB435:BetI_pBetI rows,
# most frequent first.
(
    df_variants.filter(pl.col("RBS") == "pLIB435:BetI_pBetI")
    .get_column("BCD_RBS:RBS|seq")
    .value_counts(sort=True)
)

In [None]:
# Rank-abundance curves (log y) of BCD_RBS:RBS sequences for each labelled
# library, overlaid with the curve for rows whose RBS label is null ("BCD").
hv.Overlay(
    [
        (
            df_variants.filter(pl.col("RBS") == lib)
            .get_column("BCD_RBS:RBS|seq")
            .value_counts(sort=True)
            .to_pandas()
            .hvplot.step(logy=True, label=lib)
        )
        for lib in ["pLIB433:PhlF_pPhlF", "pLIB434:LacI_pTac", "pLIB435:BetI_pBetI"]
    ]
) * (
    df_variants.filter(pl.col("RBS").is_null())
    .get_column("BCD_RBS:RBS|seq")
    .value_counts(sort=True)
    .to_pandas()
    .hvplot.step(logy=True, label="BCD")
)

In [None]:
# pLIB431-432:RBS sequences seen more than 10 times among B0032m-labelled rows.
(
    df_variants.filter(pl.col("RBS") == "pLIB431-432:RBS=B0032m")
    .get_column("pLIB431-432:RBS|seq")
    .value_counts(sort=True)
    .filter(pl.col("count") > 10)
)

In [None]:
# pLIB431-432:RBS sequences seen more than 10 times among B0033m-labelled rows.
(
    df_variants.filter(pl.col("RBS") == "pLIB431-432:RBS=B0033m")
    .get_column("pLIB431-432:RBS|seq")
    .value_counts(sort=True)
    .filter(pl.col("count") > 10)
)

In [None]:
# pLIB431-432:RBS sequences seen more than 10 times among B0034m-labelled rows.
(
    df_variants.filter(pl.col("RBS") == "pLIB431-432:RBS=B0034m")
    .get_column("pLIB431-432:RBS|seq")
    .value_counts(sort=True)
    .filter(pl.col("count") > 10)
)

In [None]:
# Rank-abundance curve (log y) of pLIB431-432:RBS sequences among
# StrongRBSLib-labelled rows.
(
    df_variants.filter(pl.col("RBS") == "pLIB431-432:RBS=StrongRBSLib")
    .get_column("pLIB431-432:RBS|seq")
    .value_counts(sort=True)
    .to_pandas()
    .hvplot.step(logy=True)
)

## Representative seqs

In [None]:
# Histogram of BCD_RBS:RBS sequence lengths (bytes) for pLIB435:BetI_pBetI rows.
(
    df_variants.filter(pl.col("RBS") == "pLIB435:BetI_pBetI")
    .get_column("BCD_RBS:RBS|seq")
    .str.len_bytes()
    .to_pandas()
    .hvplot.hist(bins=100)
)

In [None]:
# Show every column whose name starts with "BCD_RBS:RBS|" (regex column selection).
df_variants.select(pl.col(r"^BCD_RBS:RBS\|.*$"))

In [None]:
# Distribution of BCD_RBS:RBS sequence lengths for pLIB435:BetI_pBetI rows,
# with each length's share of the total.
(
    df_variants.filter(pl.col("RBS") == "pLIB435:BetI_pBetI")
    .get_column("BCD_RBS:RBS|seq")
    .str.len_bytes()
    .value_counts(sort=True)
    .with_columns(frac=pl.col("count") / pl.col("count").sum())
)

In [None]:
# BCD_RBS:RBS sequence counts for pLIB435:BetI_pBetI rows, most frequent first.
(
    df_variants.filter(pl.col("RBS") == "pLIB435:BetI_pBetI")
    .get_column("BCD_RBS:RBS|seq")
    .value_counts(sort=True)
)

In [None]:
# Counts of the 14-byte BCD_RBS:RBS sequences among pLIB435:BetI_pBetI rows,
# most frequent first.
df_rbs_seqs = (
    df_variants.filter(
        pl.col("RBS") == "pLIB435:BetI_pBetI",
        pl.col("BCD_RBS:RBS|seq").str.len_bytes() == 14,
    )
    .get_column("BCD_RBS:RBS|seq")
    .value_counts(sort=True)
)
df_rbs_seqs.to_pandas()

In [None]:
# Rank-abundance step plot (log y) of the 14-byte RBS sequences.
df_rbs_seqs.to_pandas().hvplot.step(logy=True)

In [None]:
# Consensus sequences of the pLIB435:BetI_pBetI rows that carry the most
# frequent 14-byte RBS (first row of df_rbs_seqs), printed as FASTA records.
df_seqs = df_variants.filter(
    pl.col("RBS") == "pLIB435:BetI_pBetI",
    pl.col("BCD_RBS:RBS|seq") == df_rbs_seqs["BCD_RBS:RBS|seq"][0],
)["consensus_seq"]
# Iterate over head(10) rather than indexing range(10), so fewer than ten
# matching rows prints what is available instead of raising an index error.
for idx, consensus in enumerate(df_seqs.head(10)):
    print(f">pLIB435_rbs0_{idx}")
    print(consensus)

## Comparison with BCD SD library

In [None]:
%%time
# BCD_RBS:RBS sequence counts for pLIB435:BetI_pBetI rows, annotated with the
# share of all rows (computed before the count cut-off), the sequence length
# and whether the sequence is one of the designed 14-nt library members.
# Only sequences seen more than 20 times are kept.
df_lib = (
    df_variants.filter(pl.col("RBS") == "pLIB435:BetI_pBetI")
    .get_column("BCD_RBS:RBS|seq")
    .value_counts(sort=True)
    .with_columns(
        frac=pl.col("count") / pl.col("count").sum(),
        len=pl.col("BCD_RBS:RBS|seq").str.len_bytes(),
        in_library=pl.col("BCD_RBS:RBS|seq").is_in(bcd_sds),
    )
    .filter(pl.col("count") > 20)
)
df_lib

In [None]:
# Summed read fraction of sequences inside vs. outside the designed library.
df_lib.group_by("in_library").agg(pl.col("frac").sum())

In [None]:
# Same split, restricted to sequences that are exactly 14 bytes long.
(
    df_lib.filter(pl.col("BCD_RBS:RBS|seq").str.len_bytes() == 14)
    .group_by("in_library")
    .agg(pl.col("frac").sum())
)

## Terminator mutations

In [None]:
def variant_dist(df, rbs, filter_rbs=True):
    """Count the distinct sequences found in the ``"<rbs>|seq"`` column.

    Parameters
    ----------
    df : polars.DataFrame
        Table containing the ``"<rbs>|seq"`` column (and an ``"RBS"`` column
        when ``filter_rbs`` is true).
    rbs : str
        Segment name; selects the sequence column and, when filtering, the
        value required in the ``"RBS"`` column.
    filter_rbs : bool
        If true, only rows whose ``"RBS"`` equals ``rbs`` are counted.

    Returns
    -------
    polars.DataFrame
        One row per distinct sequence, sorted by descending count, with
        ``count``, ``frac`` (count / total count) and ``len`` (byte length).
    """
    column = f"{rbs}|seq"
    subset = df.filter(pl.col("RBS") == rbs) if filter_rbs else df
    counts = subset.get_column(column).value_counts(sort=True)
    return counts.with_columns(
        frac=pl.col("count") / pl.col("count").sum(),
        len=pl.col(column).str.len_bytes(),
    )

In [None]:
# Sequence distribution of one segment; the commented lines are alternative
# segments/filters that can be swapped in. The active call counts
# "pLIB430-435:downstream|seq" over all rows (no RBS filtering).
# seqs = variant_dist(df_variants, "pLIB433:PhlF_pPhlF")
# seqs = variant_dist(df_variants, "pLIB434:LacI_pTac")
# seqs = variant_dist(df_variants.filter(pl.col("depth") > 10), "pLIB435:BetI_pBetI")
# seqs = variant_dist(df_variants.filter(pl.col("depth") > 10), "pLIB430-435:upstream", filter_rbs=False)
seqs = variant_dist(df_variants, "pLIB430-435:downstream", filter_rbs=False)
seqs

In [None]:
# Most frequent sequence (first row, first column of the sorted counts).
seqs[0, 0]

In [None]:
# Distribution of "pLIB430-435:downstream|seq" computed separately for the
# rows of each of the three labelled libraries (one frame per library, in the
# order listed).
df_dt3s = [
    variant_dist(
        df_variants.filter(pl.col("RBS") == rbs),
        "pLIB430-435:downstream",
        filter_rbs=False,
    )
    for rbs in ("pLIB433:PhlF_pPhlF", "pLIB434:LacI_pTac", "pLIB435:BetI_pBetI")
]

In [None]:
# Join the three per-library distributions on the sequence (default join type,
# so only sequences present in every library remain). Columns of the first
# frame keep their names; those of the second and third frames get the
# suffixes "_0" and "_1".
df_dt3 = df_dt3s[0]
for idx, other_df in enumerate(df_dt3s[1:]):
    df_dt3 = df_dt3.join(other_df, on="pLIB430-435:downstream|seq", suffix=f"_{idx}")

In [None]:
%%opts Scatter [width=300]
# Pairwise scatter plots of the per-library fractions:
# first vs second, first vs third, second vs third library.
(
    df_dt3[["frac", "frac_0"]].to_pandas().hvplot.scatter("frac", "frac_0")
    + df_dt3[["frac", "frac_1"]].to_pandas().hvplot.scatter("frac", "frac_1")
    + df_dt3[["frac_0", "frac_1"]].to_pandas().hvplot.scatter("frac_0", "frac_1")
)

In [None]:
# Joined per-library table behind the scatter plots.
df_dt3

## Comparison with oligo pool

In [None]:
# Designed 14-nt sequences as a Series named "seq" (duplicates retained).
oligo_pool = pl.Series("seq", bcd_sds)

In [None]:
# Observed count of each non-null BCD_RBS:RBS sequence joined with the number
# of times that sequence appears in the designed pool ("count_right").
df_seq_vs_pool = (
    df_variants.filter(pl.col("BCD_RBS:RBS|seq").is_not_null())
    .get_column("BCD_RBS:RBS|seq")
    .value_counts(sort=True)
    .join(oligo_pool.value_counts(), left_on="BCD_RBS:RBS|seq", right_on="seq")
)
df_seq_vs_pool

In [None]:
# Observed count vs. number of appearances in the designed pool.
df_seq_vs_pool.to_pandas().hvplot.scatter("count", "count_right")

In [None]:
# Same expansion as bcd_sds, but every sequence keeps the index of the oligo
# it came from and the base that replaced the N.
bcd_sds_Ns_idx = [(idx, s[16:29]) for idx, s in enumerate(bcd_sd_oligos)]
# Nested comprehension instead of sum(list_of_lists, []), which re-copies the
# accumulator on every step; output order (A, T, C, G per oligo) is unchanged.
bcd_sds_13_idx = [
    (idx, s.replace("N", base), base)
    for idx, s in bcd_sds_Ns_idx
    for base in "ATCG"
]
# Append the trailing "A" to obtain the 14-nt sequences.
bcd_sds_idx = [(idx, f"{s}A", n_to) for idx, s, n_to in bcd_sds_13_idx]

In [None]:
# One row per (oligo index, expanded sequence, substituted base) tuple.
# orient="row" states explicitly that each tuple is a row, instead of relying
# on Polars inferring the orientation from the data shape.
oligo_pool_idx = pl.DataFrame(
    bcd_sds_idx, schema=["oligo_num", "seq", "N_to"], orient="row"
)

In [None]:
# Keep only the designed sequences that arise from exactly one
# (oligo, substituted base) combination, so each can be attributed to a single
# oligo; the one-element lists are then unwrapped back into scalars.
# (The column name "appearences" is kept as spelled because it is part of the
# resulting table.)
oligo_pool_idx2 = (
    oligo_pool_idx.group_by("seq")
    .agg(pl.col("oligo_num"), pl.col("N_to"))
    .with_columns(appearences=pl.col("oligo_num").list.len())
    .sort("appearences", descending=True)
    .filter(pl.col("appearences") == 1)
    .with_columns(pl.col("oligo_num").list.get(0), pl.col("N_to").list.get(0))
)
oligo_pool_idx2

In [None]:
# Observed counts of non-null BCD_RBS:RBS sequences, joined with the uniquely
# attributable designed sequences (adds oligo_num and N_to).
df_seq_vs_pool2 = (
    df_variants.filter(pl.col("BCD_RBS:RBS|seq").is_not_null())
    .get_column("BCD_RBS:RBS|seq")
    .value_counts(sort=True)
    .join(oligo_pool_idx2, left_on="BCD_RBS:RBS|seq", right_on="seq")
)
df_seq_vs_pool2

In [None]:
# Observed count per source oligo, coloured by the base that replaced N.
df_seq_vs_pool2.to_pandas().hvplot.scatter("oligo_num", "count", color="N_to")

In [None]:
# Mean and standard deviation of the observed counts within each source oligo.
df_seq_vs_pool2.group_by("oligo_num").agg(
    mean=pl.col("count").mean(), std=pl.col("count").std()
)

In [None]:
# base_frac: each sequence's count divided by the total count of all joined
# sequences that come from the same oligo.
df_seq_vs_pool2_base = df_seq_vs_pool2.with_columns(
    base_frac=pl.col("count") / pl.col("count").sum().over("oligo_num")
)

In [None]:
# Mean base_frac per substituted base, scaled by 3/4.
# NOTE(review): the rationale for the 3/4 factor is not evident from this
# notebook — confirm.
df_seq_vs_pool2_base.group_by("N_to").agg(pl.col("base_frac").mean() / 4 * 3)

In [None]:
# Within-oligo fraction per source oligo, coloured by the substituted base.
df_seq_vs_pool2_base.to_pandas().hvplot.scatter("oligo_num", "base_frac", color="N_to")

# 231130_bcd_rbses_run3_minion

In [None]:
# Load every extract_segments Arrow file of the 231130 MinION run into one
# eager DataFrame (this rebinds `df`).
df = concat_glob(
    "/home/jqs1/scratch/sequencing/231130_bcd_rbses_run3_minion/20231130_1904_MN35044_FAX40126_ee95ee31/extract_segments/*.arrow"
).collect()

In [None]:
# Per-row flags:
#   dup    - the value in `name` occurs more than once in the table
#   e2e    - exactly two of the four listed BC:UPSTREAM / pLIB430-435:upstream
#            segment names occur in variants_path
#   bc_e2e - same test for the BC:UPSTREAM / BC:SPACER2 names
df2 = df.with_columns(
    dup=pl.col("name").is_duplicated(),
    **{
        flag: pl.col("variants_path").list.set_intersection(segments).list.len() == 2
        for flag, segments in (
            (
                "e2e",
                [
                    "<BC:UPSTREAM",
                    "<pLIB430-435:upstream",
                    ">BC:UPSTREAM",
                    ">pLIB430-435:upstream",
                ],
            ),
            (
                "bc_e2e",
                [
                    "<BC:UPSTREAM",
                    "<BC:SPACER2",
                    ">BC:UPSTREAM",
                    ">BC:SPACER2",
                ],
            ),
        )
    },
)

In [None]:
# Label end-to-end, non-duplicated rows with an "RBS" name and count the rows
# for every (RBS label, promoter variant) combination.
(
    df2.filter(pl.col("e2e"), ~pl.col("dup"))
    .with_columns(
        RBS=pl.coalesce(
            label_columns(
                [
                    "pLIB433:PhlF_pPhlF|seq",
                    "pLIB434:LacI_pTac|seq",
                    "pLIB435:BetI_pBetI|seq",
                ],
                lambda col: col.partition("|")[0],
            ),
            pl.concat_str(
                pl.lit("pLIB431-432:RBS="), pl.col("pLIB431-432:RBS|variant")
            ),
        )
    )
    .select(["RBS", "pLIB430-435:promoter|variant"])
    .select(pl.struct(pl.all()).alias("foo"))
    .get_column("foo")
    .value_counts()
)

In [None]:
# RBS label and promoter variant of every end-to-end, non-duplicated row, as a
# pandas DataFrame.
# NOTE(review): this rebinds `df_variants`, which holds a Polars frame in the
# 231201 section — re-running earlier cells after this one will not work.
df_variants = (
    df2.filter(pl.col("e2e"), ~pl.col("dup"))
    .with_columns(
        RBS=pl.coalesce(
            label_columns(
                [
                    "pLIB433:PhlF_pPhlF|seq",
                    "pLIB434:LacI_pTac|seq",
                    "pLIB435:BetI_pBetI|seq",
                ],
                lambda col: col.partition("|")[0],
            ),
            pl.concat_str(
                pl.lit("pLIB431-432:RBS="), pl.col("pLIB431-432:RBS|variant")
            ),
        )
    )
    .select(["RBS", "pLIB430-435:promoter|variant"])
    .to_pandas()
)

In [None]:
# Row counts per (RBS, promoter variant), with promoter variants as columns.
df_variants.value_counts().unstack(1)

# 230707_repressilators

In [None]:
# Load every extract_segments Arrow file of the 230707 repressilator run into
# one eager DataFrame (this rebinds `df`).
df = concat_glob(
    "/home/jqs1/scratch/sequencing/230707_repressilators/20230707_2040_MN35044_FAS94231_25542e0d/extract_segments/*.arrow"
).collect()

In [None]:
# Per-row flags:
#   dup    - the value in `name` occurs more than once in the table
#   e2e    - exactly two of the four listed BC:UPSTREAM / UNS3 segment names
#            occur in variants_path
#   bc_e2e - same test for the BC:UPSTREAM / BC:SPACER2 names
df2 = df.with_columns(
    dup=pl.col("name").is_duplicated(),
    **{
        flag: pl.col("variants_path").list.set_intersection(segments).list.len() == 2
        for flag, segments in (
            (
                "e2e",
                [
                    "<BC:UPSTREAM",
                    "<UNS3",
                    ">BC:UPSTREAM",
                    ">UNS3",
                ],
            ),
            (
                "bc_e2e",
                [
                    "<BC:UPSTREAM",
                    "<BC:SPACER2",
                    ">BC:UPSTREAM",
                    ">BC:SPACER2",
                ],
            ),
        )
    },
)

In [None]:
# Fraction of rows that are end-to-end and not duplicated.
len(df2.filter(pl.col("e2e"), ~pl.col("dup"))) / len(df2)

In [None]:
# The three RBS sequence columns of the end-to-end, non-duplicated rows.
df2.filter(pl.col("e2e"), ~pl.col("dup"))[
    ["RBS1:RBS|seq", "RBS2:RBS|seq", "RBS3:RBS|seq"]
]

In [None]:
# Count each (RBS1, RBS2, RBS3) sequence combination, most frequent first.
(
    df2.filter(pl.col("e2e"), ~pl.col("dup"))
    .select(["RBS1:RBS|seq", "RBS2:RBS|seq", "RBS3:RBS|seq"])
    .select(pl.struct(pl.all()).alias("variant"))
    .to_series()
    .value_counts(sort=True)
)

In [None]:
# RBS1 sequences seen more than 20 times among end-to-end, non-duplicated
# rows with grouping_depth > 5.
(
    df2.filter(pl.col("e2e"), ~pl.col("dup"), pl.col("grouping_depth") > 5)
    .get_column("RBS1:RBS|seq")
    .value_counts(sort=True)
    .filter(pl.col("count") > 20)
)

In [None]:
# Rank-abundance curves (log y) of the sequences at each of the three RBS
# positions.
hv.Overlay(
    [
        (
            df2.filter(pl.col("e2e"), ~pl.col("dup"))
            .get_column(col)
            .value_counts(sort=True)
            .get_column("count")
            .to_pandas()
            .hvplot.step(logy=True, label=col)
        )
        for col in ["RBS1:RBS|seq", "RBS2:RBS|seq", "RBS3:RBS|seq"]
    ]
)

In [None]:
# Count each (RBS1, RBS2) sequence pair, most frequent first.
(
    df2.filter(pl.col("e2e"), ~pl.col("dup"))
    .select(["RBS1:RBS|seq", "RBS2:RBS|seq"])
    .select(pl.struct(pl.all()).alias("variant"))
    .to_series()
    .value_counts(sort=True)
)

In [None]:
# Count each (RBS2, RBS3) sequence pair, most frequent first.
(
    df2.filter(pl.col("e2e"), ~pl.col("dup"))
    .select(["RBS2:RBS|seq", "RBS3:RBS|seq"])
    .select(pl.struct(pl.all()))
    .to_series()
    .value_counts(sort=True)
)

In [None]:
# Count each (RBS1, RBS2, RBS3) sequence triple, most frequent first.
(
    df2.filter(pl.col("e2e"), ~pl.col("dup"))
    .select(["RBS1:RBS|seq", "RBS2:RBS|seq", "RBS3:RBS|seq"])
    .select(pl.struct(pl.all()))
    .to_series()
    .value_counts(sort=True)
)

In [None]:
# Rank-abundance curve of the (RBS1, RBS2, RBS3) triples.
(
    df2.filter(pl.col("e2e"), ~pl.col("dup"))
    .select(["RBS1:RBS|seq", "RBS2:RBS|seq", "RBS3:RBS|seq"])
    .select(pl.struct(pl.all()))
    .to_series()
    .value_counts(sort=True)
    .get_column("count")
    .to_pandas()
    .hvplot.step()
)

In [None]:
# Rank-abundance curves for every pair of RBS positions.
hv.Overlay(
    [
        (
            df2.filter(pl.col("e2e"), ~pl.col("dup"))
            .select(list(cols))
            .select(pl.struct(pl.all()))
            .to_series()
            .value_counts(sort=True)
            .get_column("count")
            .to_pandas()
            .hvplot.step(label=", ".join(cols))
        )
        for cols in it.combinations(["RBS1:RBS|seq", "RBS2:RBS|seq", "RBS3:RBS|seq"], 2)
    ]
)

In [None]:
# Compare how often the same sequence occurs at two RBS positions. The
# commented assignments are alternative column choices.
col1 = "RBS1:RBS|seq"
# col1 = "RBS2:RBS|seq"
col2 = "RBS2:RBS|seq"
# col2 = "RBS3:RBS|seq"
df2_filtered = df2.filter(pl.col("e2e"), ~pl.col("dup"))
# Sequences seen more than 100 times at col1, joined with their count at col2
# (the col2 count lands in "count_right").
df_joined = (
    df2_filtered[col1]
    .value_counts()
    .filter(pl.col("count") > 100)
    .join(df2_filtered[col2].value_counts(), left_on=col1, right_on=col2)
)
# Squared Pearson correlation between the two count columns.
r2 = np.corrcoef(df_joined["count"], df_joined["count_right"])[0, 1] ** 2
df_joined[["count", "count_right"]].to_pandas().hvplot.scatter(
    "count", "count_right", title=f"r^2: {r2:.2f}"
)

## Terminator mutations

In [None]:
# Distinct "LacI|seq" sequences among end-to-end, non-duplicated rows with
# depth >= 3: count, share of the total and byte length, most frequent first.
seqs = (
    df2.filter(pl.col("e2e"), ~pl.col("dup"), pl.col("depth") >= 3)
    .get_column("LacI|seq")
    .value_counts(sort=True)
    .with_columns(
        frac=pl.col("count") / pl.col("count").sum(),
        len=pl.col("LacI|seq").str.len_bytes(),
    )
)
seqs

In [None]:
# Most frequent LacI sequence (first row, first column).
seqs[0, 0]

In [None]:
# Distinct "PhlF|seq" sequences among end-to-end, non-duplicated rows with
# depth >= 3: count, share of the total and byte length, most frequent first.
seqs = (
    df2.filter(pl.col("e2e"), ~pl.col("dup"), pl.col("depth") >= 3)
    .get_column("PhlF|seq")
    .value_counts(sort=True)
    .with_columns(
        frac=pl.col("count") / pl.col("count").sum(),
        len=pl.col("PhlF|seq").str.len_bytes(),
    )
)
seqs

In [None]:
# Print the (up to) ten most frequent sequences as FASTA records. Iterating
# over head(10) avoids an out-of-bounds error when `seqs` has fewer than ten
# rows.
for idx, seq in enumerate(seqs.to_series(0).head(10)):
    print(f">phlf{idx}")
    print(seq)

In [None]:
# Distinct "BetI|seq" sequences among end-to-end, non-duplicated rows with
# depth >= 3: count, share of the total and byte length, most frequent first.
seqs = (
    df2.filter(pl.col("e2e"), ~pl.col("dup"), pl.col("depth") >= 3)
    .get_column("BetI|seq")
    .value_counts(sort=True)
    .with_columns(
        frac=pl.col("count") / pl.col("count").sum(),
        len=pl.col("BetI|seq").str.len_bytes(),
    )
)
seqs

In [None]:
# Print the (up to) ten most frequent sequences as FASTA records. Iterating
# over head(10) avoids an out-of-bounds error when `seqs` has fewer than ten
# rows.
for idx, seq in enumerate(seqs.to_series(0).head(10)):
    print(f">beti{idx}")
    print(seq)

# 230818_repressilators

In [None]:
%%time
# Load every extract_segments Arrow file of the 230818 repressilator run into
# one eager DataFrame (this rebinds `df`).
df = concat_glob(
    "/home/jqs1/scratch/sequencing/230818_repressilators/20230905_1132_1H_PAQ85679_c9d74ddb/extract_segments/*.arrow"
).collect()

In [None]:
%%time
# Per-row flags:
#   dup    - the value in `name` occurs more than once in the table
#   e2e    - exactly two of the four listed BC:UPSTREAM / UNS3 segment names
#            occur in variants_path
#   bc_e2e - same test for the BC:UPSTREAM / BC:SPACER2 names
df2 = df.with_columns(
    dup=pl.col("name").is_duplicated(),
    **{
        flag: pl.col("variants_path").list.set_intersection(segments).list.len() == 2
        for flag, segments in (
            (
                "e2e",
                [
                    "<BC:UPSTREAM",
                    "<UNS3",
                    ">BC:UPSTREAM",
                    ">UNS3",
                ],
            ),
            (
                "bc_e2e",
                [
                    "<BC:UPSTREAM",
                    "<BC:SPACER2",
                    ">BC:UPSTREAM",
                    ">BC:SPACER2",
                ],
            ),
        )
    },
)

In [None]:
# The three RBS sequence columns of the end-to-end, non-duplicated rows.
df2.filter(pl.col("e2e"), ~pl.col("dup"))[
    ["RBS1:RBS|seq", "RBS2:RBS|seq", "RBS3:RBS|seq"]
]

In [None]:
# Count each (RBS1, RBS2, RBS3) sequence combination, most frequent first.
(
    df2.filter(pl.col("e2e"), ~pl.col("dup"))
    .select(["RBS1:RBS|seq", "RBS2:RBS|seq", "RBS3:RBS|seq"])
    .select(pl.struct(pl.all()).alias("variant"))
    .to_series()
    .value_counts(sort=True)
)

In [None]:
# RBS1 sequences among end-to-end, non-duplicated rows with depth > 5: count,
# share of the total (computed before the cut-off) and byte length, keeping
# only sequences seen more than 20 times.
(
    df2.filter(pl.col("e2e"), ~pl.col("dup"), pl.col("depth") > 5)
    .get_column("RBS1:RBS|seq")
    .value_counts(sort=True)
    .with_columns(
        frac=pl.col("count") / pl.col("count").sum(),
        len=pl.col("RBS1:RBS|seq").str.len_bytes(),
    )
    .filter(pl.col("count") > 20)
)

In [None]:
# Distribution of RBS1 sequence lengths (same row filter), with each length's
# share of the total; lengths seen 20 times or fewer are dropped.
(
    df2.filter(pl.col("e2e"), ~pl.col("dup"), pl.col("depth") > 5)
    .get_column("RBS1:RBS|seq")
    .str.len_bytes()
    .value_counts(sort=True)
    .with_columns(frac=pl.col("count") / pl.col("count").sum())
    .filter(pl.col("count") > 20)
)

In [None]:
# Share of rows (end-to-end, non-duplicated, depth > 5) in which all three RBS
# sequences are exactly 13 bytes long.
(
    df2.filter(pl.col("e2e"), ~pl.col("dup"), pl.col("depth") > 5)
    .with_columns(
        good_RBS_length=(pl.col("RBS1:RBS|seq").str.len_bytes() == 13)
        & (pl.col("RBS2:RBS|seq").str.len_bytes() == 13)
        & (pl.col("RBS3:RBS|seq").str.len_bytes() == 13)
    )
    .get_column("good_RBS_length")
    .value_counts()
    .with_columns(frac=pl.col("count") / pl.col("count").sum())
)

In [None]:
# Rank-abundance curves (log y) of the sequences at each of the three RBS
# positions, for rows with depth > 5.
hv.Overlay(
    [
        (
            df2.filter(pl.col("e2e"), ~pl.col("dup"), pl.col("depth") > 5)
            .get_column(col)
            .value_counts(sort=True)
            .get_column("count")
            .to_pandas()
            .hvplot.step(logy=True, label=col)
        )
        for col in ["RBS1:RBS|seq", "RBS2:RBS|seq", "RBS3:RBS|seq"]
    ]
)

In [None]:
# Count each (RBS1, RBS2) sequence pair among rows with depth > 5.
(
    df2.filter(pl.col("e2e"), ~pl.col("dup"), pl.col("depth") > 5)
    .select(["RBS1:RBS|seq", "RBS2:RBS|seq"])
    .select(pl.struct(pl.all()).alias("variant"))
    .to_series()
    .value_counts(sort=True)
)

In [None]:
# Count each (RBS2, RBS3) sequence pair among rows with depth > 5.
(
    df2.filter(pl.col("e2e"), ~pl.col("dup"), pl.col("depth") > 5)
    .select(["RBS2:RBS|seq", "RBS3:RBS|seq"])
    .select(pl.struct(pl.all()))
    .to_series()
    .value_counts(sort=True)
)

In [None]:
# Count each (RBS1, RBS2, RBS3) sequence triple among rows with depth > 5.
(
    df2.filter(pl.col("e2e"), ~pl.col("dup"), pl.col("depth") > 5)
    .select(["RBS1:RBS|seq", "RBS2:RBS|seq", "RBS3:RBS|seq"])
    .select(pl.struct(pl.all()))
    .to_series()
    .value_counts(sort=True)
)

In [None]:
# Rank-abundance curve (log y) of the (RBS1, RBS2, RBS3) triples.
(
    df2.filter(pl.col("e2e"), ~pl.col("dup"), pl.col("depth") > 5)
    .select(["RBS1:RBS|seq", "RBS2:RBS|seq", "RBS3:RBS|seq"])
    .select(pl.struct(pl.all()))
    .to_series()
    .value_counts(sort=True)
    .get_column("count")
    .to_pandas()
    .hvplot.step(logy=True)
)

In [None]:
# Rank-abundance curves (log y) for every pair of RBS positions, for rows
# with depth > 5.
hv.Overlay(
    [
        df2.filter(pl.col("e2e"), ~pl.col("dup"), pl.col("depth") > 5)[list(cols)]
        .select(pl.struct(pl.all()))
        .to_series()
        # Plot only the counts, as the other rank-abundance cells in this
        # notebook do, so the struct column is not converted to pandas and
        # handed to hvplot alongside them.
        .value_counts(sort=True)["count"]
        .to_pandas()
        .hvplot.step(label=", ".join(cols), logy=True)
        for cols in it.combinations(["RBS1:RBS|seq", "RBS2:RBS|seq", "RBS3:RBS|seq"], 2)
    ]
)

In [None]:
# Compare how often the same sequence occurs at two RBS positions. The
# commented assignments are alternative column choices.
col1 = "RBS1:RBS|seq"
# col1 = "RBS2:RBS|seq"
col2 = "RBS2:RBS|seq"
# col2 = "RBS3:RBS|seq"
df2_filtered = df2.filter(pl.col("e2e"), ~pl.col("dup"))
# Counts at col1 joined with the counts of the same sequence at col2
# ("count_right").
# NOTE(review): `count > 0` keeps every row of a value_counts() result —
# presumably a placeholder for a real threshold; confirm.
df_joined = (
    df2_filtered[col1]
    .value_counts()
    .filter(pl.col("count") > 0)
    .join(df2_filtered[col2].value_counts(), left_on=col1, right_on=col2)
)
# Squared Pearson correlation between the two count columns.
r2 = np.corrcoef(df_joined["count"], df_joined["count_right"])[0, 1] ** 2
df_joined[["count", "count_right"]].to_pandas().hvplot.scatter(
    "count", "count_right", title=f"r^2: {r2:.2f}"
)

## Comparison with BCD SD library

In [None]:
%%time
df_lib = (
    df2.filter(pl.col("e2e"), ~pl.col("dup"), pl.col("depth") > 5)["RBS1:RBS|seq"]
    .value_counts(sort=True)
    .with_columns(
        frac=pl.col("count") / pl.col("count").sum(),
        len=pl.col("RBS1:RBS|seq").str.len_bytes(),
        in_library=pl.col("RBS1:RBS|seq").is_in(bcd_sds_13),
    )
    .filter(pl.col("count") > 20)
)
df_lib

In [None]:
# Summed read fraction of sequences inside vs. outside the designed library.
df_lib.group_by("in_library").agg(pl.col("frac").sum())

In [None]:
# Same split, restricted to sequences that are exactly 13 bytes long.
(
    df_lib.filter(pl.col("RBS1:RBS|seq").str.len_bytes() == 13)
    .group_by("in_library")
    .agg(pl.col("frac").sum())
)