In [None]:
import itertools as it
from collections import Counter, defaultdict
from functools import partial
from glob import glob
from pathlib import Path

import gfapy
import holoviews as hv
import hvplot.pandas
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.dataset as ds
from tqdm.auto import tqdm, trange

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import paulssonlab.sequencing.align as align
import paulssonlab.sequencing.cigar as scigar
import paulssonlab.sequencing.consensus as con
import paulssonlab.sequencing.gfa as sgfa
import paulssonlab.sequencing.io as sio
import paulssonlab.sequencing.processing as processing
from paulssonlab.util.sequence import reverse_complement

In [None]:
hv.extension("bokeh")

In [None]:
pl.enable_string_cache()

# Functions

In [None]:
def concat_glob(filename):
    """Lazily scan every Arrow IPC file matching a glob pattern and concatenate them.

    Parameters
    ----------
    filename : str
        Glob pattern (passed to ``glob.glob``) selecting the IPC files.

    Returns
    -------
    polars.LazyFrame
        ``how="diagonal"`` concatenation of one ``pl.scan_ipc`` per matched file.

    Raises
    ------
    FileNotFoundError
        If the pattern matches no files.
    """
    # glob() returns matches in arbitrary order; sort so the row order of the
    # concatenated frame is reproducible (load_sequencing's is_first_distinct
    # flag depends on row order).
    paths = sorted(glob(filename))
    if not paths:
        # fail with the offending pattern instead of an opaque error from pl.concat([])
        raise FileNotFoundError(f"no files match {filename!r}")
    return pl.concat([pl.scan_ipc(path) for path in paths], how="diagonal")

In [None]:
def label_columns(cols, func=None):
    """Build a chained ``when/then`` expression labelling rows by non-null column.

    For each column in ``cols`` (in order) a ``when(col is not null).then(label)``
    branch is appended, where the label is the column name itself or
    ``func(column name)`` when ``func`` is given.

    Returns the chained polars expression, or ``None`` when ``cols`` is empty.
    """
    chain = None
    for name in cols:
        not_null = pl.col(name).is_not_null()
        # the first branch starts from pl.when; later branches extend the chain
        branch = pl.when(not_null) if chain is None else chain.when(not_null)
        label = name if func is None else func(name)
        chain = branch.then(pl.lit(label))
    return chain

In [None]:
def load_sequencing(filename, filter=True, path_col="variants_path"):
    """Load sequencing Arrow files matching a glob and add alignment flag columns.

    Parameters
    ----------
    filename : str
        Glob pattern forwarded to ``concat_glob``.
    filter : bool, default True
        When true, keep only rows where both ``primary_alignment`` and ``e2e``
        are true.
    path_col : str, default "variants_path"
        Name of the list column of oriented segment names used to compute
        ``e2e`` and ``bc_e2e`` (e.g. ``"full_path"`` for files that carry that
        column instead).

    Returns
    -------
    The frame returned by ``concat_glob`` with four added boolean columns:

    - ``dup``: the row's ``name`` occurs more than once.
    - ``primary_alignment``: the row is the first occurrence of its ``name``.
    - ``e2e``: exactly two of ``<UNS9``, ``>UNS9``, ``<UNS3``, ``>UNS3`` occur
      in the path.
    - ``bc_e2e``: exactly two of ``<BC:T7_prom``, ``>BC:T7_prom``,
      ``<BC:spacer2``, ``>BC:spacer2`` occur in the path.
    """
    path = pl.col(path_col)

    def _hits_two(markers):
        # true when the path shares exactly two distinct entries with `markers`
        return path.list.set_intersection(markers).list.len() == 2

    df = concat_glob(filename).with_columns(
        dup=pl.col("name").is_duplicated(),
        primary_alignment=pl.col("name").is_first_distinct(),
        e2e=_hits_two(["<UNS9", ">UNS9", "<UNS3", ">UNS3"]),
        bc_e2e=_hits_two(
            ["<BC:T7_prom", ">BC:T7_prom", "<BC:spacer2", ">BC:spacer2"]
        ),
    )
    if filter:
        df = df.filter(pl.col("primary_alignment"), pl.col("e2e"))
    return df

In [None]:
def path_to_barcode_string(path_col, bits=list(range(30))):
    """Expression rendering a path column as a string of "0"/"1" characters.

    ``path_col`` is a column name or a polars expression for a list column.
    For each ``bit`` in ``bits`` (in order) the output gets "1" if the list
    contains ``>BC:bit{bit}=1`` or ``<BC:bit{bit}=1``, otherwise "0"; the
    characters are concatenated with ``pl.concat_str``.
    """
    path_expr = pl.col(path_col) if isinstance(path_col, str) else path_col

    def bit_char(bit):
        # the "=1" segment may appear in either orientation
        is_set = path_expr.list.contains(f">BC:bit{bit}=1").or_(
            path_expr.list.contains(f"<BC:bit{bit}=1")
        )
        return pl.when(is_set).then(pl.lit("1")).otherwise(pl.lit("0"))

    return pl.concat_str([bit_char(bit) for bit in bits])

# Biobear

In [None]:
import biobear

In [None]:
import polars as pl
import pyarrow as pa
import pyarrow.compute as pc

In [None]:
s = biobear.connect()

In [None]:
x = s.read_bam_file(
    "/home/jqs1/scratch/sequencing/230707_repressilators/20230707_2040_MN35044_FAS94231_25542e0d/bam_pass/channel-1_merged.bam"
)

In [None]:
y = x.to_arrow_record_batch_reader()

In [None]:
z = y.read_next_batch()

In [None]:
z.column_names

In [None]:
z["tags"][0][0]

In [None]:
# List the public names in pyarrow.compute (a bare `pc.` is a SyntaxError and stops Run All).
sorted(name for name in dir(pc) if not name.startswith("_"))

In [None]:
?pc.struct_field

In [None]:
z["tags"]  # .cast(pa.list_(pa.map_(pa.string(), pa.string())))

In [None]:
z["quality_score"]

In [None]:
z["tags"]

In [None]:
zz = pl.from_arrow(z)

In [None]:
?zz.pivot

In [None]:
zz.columns

In [None]:
zz.schema

In [None]:
zz["quality_score"].estimated_size("mb")

In [None]:
zz["quality_score"].cast(pl.List(pl.UInt8)).estimated_size("mb")

In [None]:
{c: zz[c].estimated_size("mb") for c in zz.columns}

In [None]:
# Flatten the per-read "tags" list into one column per tag: explode the list to
# one row per (read, tag), unnest the struct, then pivot so each distinct "tag"
# becomes a column holding its "value". The final select puts the original
# columns (minus "tags") first, followed by the new tag columns.
# NOTE(review): the horizontal concat pairs rows by position, so it presumably
# relies on "name" being unique and in the same order in both frames — confirm.
pl.concat(
    [
        zz.select(pl.exclude("name", "tags")),
        zz.explode("tags")
        .unnest("tags")
        .pivot(index="name", columns="tag", values="value"),
    ],
    how="horizontal",
).select(pl.col(zz.columns).exclude("tags"), pl.all().exclude(zz.columns))

In [None]:
%%time
y = pl.from_arrow(x.to_arrow_record_batch_reader())

In [None]:
x.to_polars()

In [None]:
biobear.BamReader?a

In [None]:
!du -hsc /home/jqs1/scratch/sequencing/230707_repressilators/20230707_2040_MN35044_FAS94231_25542e0d/bam_pass/channel-1_merged.bam

In [None]:
session = biobear.BamReader(
    "/home/jqs1/scratch/sequencing/230707_repressilators/20230707_2040_MN35044_FAS94231_25542e0d/bam_pass/channel-1_merged.bam"
)

In [None]:
x = pl.from_arrow(session.to_arrow())

In [None]:
x

In [None]:
x = session.to_polars()

In [None]:
x

# 240703_pLIB476_bottlenecked

In [None]:
%%time
df = load_sequencing(
    "/home/jqs1/scratch/sequencing/240703_pLIB476_bottlenecked/output/max_divergence=0.3/extract_segments/*.arrow"
).collect()

In [None]:
gfa = gfapy.Gfa.from_file(
    "/home/jqs1/scratch/sequencing/240703_pLIB476_bottlenecked/references/pLIB476jqs.gfa"
)

In [None]:
# The second argument is the GFA segment names with everything from the first
# "=" onward removed, de-duplicated while keeping first-seen order
# (dict.fromkeys preserves insertion order).
df2 = processing.compute_divergences(
    df,
    list(dict.fromkeys(([s.split("=")[0] for s in gfa.segment_names]))),
    struct_name="extract_segments",
)

In [None]:
df2 = df2.filter(pl.col("grouping_depth") != 1)

In [None]:
df["extract_segments"].struct.fields

In [None]:
df["max_divergence"].value_counts().sort("max_divergence")

In [None]:
plt.hist(df4["max_divergence"], bins=100);

In [None]:
df3.filter(pl.col("grouping_depth") < 10)

In [None]:
# Cumulative curves over rows sorted by ascending grouping_depth:
#   frac_barcodes = running row rank / number of rows
#   frac_reads    = running sum of grouping_depth / total grouping_depth
# NOTE(review): df3 is not assigned in any cell above this one in notebook
# order — this cell appears to rely on leftover kernel state; confirm.
df3.sort("grouping_depth").select(
    pl.col("grouping_depth"),
    frac_barcodes=pl.int_range(1, pl.len() + 1, dtype=pl.UInt32) / pl.len(),
    frac_reads=pl.col("grouping_depth").cum_sum() / pl.col("grouping_depth").sum(),
).to_pandas().hvplot.step("grouping_depth", logx=True, logy=False, where="pre")

In [None]:
df4.filter(pl.col("max_divergence") > 0.05).sort("grouping_depth").select(
    pl.col("grouping_depth"),
    frac_barcodes=pl.int_range(1, pl.len() + 1, dtype=pl.UInt32) / pl.len(),
    frac_reads=pl.col("grouping_depth").cum_sum() / pl.col("grouping_depth").sum(),
).to_pandas().hvplot.step("grouping_depth", logx=True, logy=False, where="pre")

In [None]:
%%time
df3.select(processing.categorical_list_hash(pl.col("grouping_path")).unique())

In [None]:
df3["grouping_depth"].value_counts().sort("grouping_depth").with_columns(
    cum=pl.col("count").cum_sum(reverse=True)
).to_pandas().hvplot.step("grouping_depth", "cum", logy=True)

In [None]:
plt.hist(df3["grouping_depth"], bins=1000, log=True);

## Burden

In [None]:
df["SD2_variant|seq"].value_counts(sort=True).with_columns(
    frac=pl.col("count") / pl.col("count").sum(),
).to_pandas()["count"].hvplot.step(logy=True)

In [None]:
# Keep "SD2_variant|seq" values that occur in more than one row, collect the
# grouping_depth values of each, order the sequences by mean depth and number
# them (rbs_id), then scatter every individual depth against that rank.
df.filter(pl.len().over("SD2_variant|seq") > 1).group_by("SD2_variant|seq").agg(
    "grouping_depth"
).sort(pl.col("grouping_depth").list.mean()).with_columns(
    rbs_id=pl.int_range(pl.len())
).select(
    "rbs_id", "grouping_depth"
).explode(
    "grouping_depth"
).to_pandas().hvplot.scatter(
    "rbs_id", "grouping_depth", alpha=0.2, s=2
)

# Joins

## 240703_pLIB476_bottlenecked with 240612_pLIB476_bottlenecked

In [None]:
%%time
df_240703b = load_sequencing(
    "/home/jqs1/scratch/sequencing/240703_pLIB476_bottlenecked/output/max_divergence=0.3/extract_segments/*.arrow"
).collect()
df_240610b = load_sequencing(
    "/home/jqs1/scratch/sequencing/240610_pLIB476_bottleneck/pLIB476_bottleneck/pLIB476/20240607_1433_MN35044_FAX60316_7d690112/output/max_divergence=0.3/extract_segments/*.arrow"
).collect()

In [None]:
%%time
# Full (outer) join of the two runs, keyed on a hash of grouping_path.
# Columns coming from df_240610b get the "_240610b" suffix.
cols = ["grouping_path", "variants_path", "grouping_depth"]
df_joined = df_240703b.select(cols).join(
    df_240610b.select(cols),
    on=processing.categorical_list_hash(pl.col("grouping_path")),
    how="full",
    suffix="_240610b",
)

In [None]:
# Count joined rows by which side(s) have a non-null grouping_path; the two
# boolean key columns are renamed to run labels for display.
df_joined.group_by(
    pl.col("grouping_path").is_not_null(), pl.col("grouping_path_240610b").is_not_null()
).agg(pl.len()).rename(
    {
        "grouping_path": "240703_bottlenecked",
        "grouping_path_240610b": "240612_minion_bottlenecked",
    }
)

In [None]:
df_240703b.sort("grouping_depth").select(
    pl.col("grouping_depth"),
    frac_barcodes=pl.int_range(1, pl.len() + 1, dtype=pl.UInt32) / pl.len(),
    frac_reads=pl.col("grouping_depth").cum_sum() / pl.col("grouping_depth").sum(),
).to_pandas().hvplot.step("grouping_depth", logx=True, logy=False, where="pre")

In [None]:
df_240610b.sort("grouping_depth").select(
    pl.col("grouping_depth"),
    frac_barcodes=pl.int_range(1, pl.len() + 1, dtype=pl.UInt32) / pl.len(),
    frac_reads=pl.col("grouping_depth").cum_sum() / pl.col("grouping_depth").sum(),
).to_pandas().hvplot.step("grouping_depth", logx=True, logy=False, where="pre")

In [None]:
df_joined.select("grouping_depth", "grouping_depth_240610b").to_pandas().hvplot.scatter(
    "grouping_depth", "grouping_depth_240610b"
)

## 240703_pLIB476_bottlenecked with 240510_pLIB476

In [None]:
%%time
df_240703b = load_sequencing(
    "/home/jqs1/scratch/sequencing/240703_pLIB476_bottlenecked/output/max_divergence=0.3/extract_segments/*.arrow"
).collect()
df_240513 = load_sequencing(
    "/home/jqs1/scratch/sequencing/240513_pLIB473_476/20240513_1645_2C_PAW46239_b49d575f/output/max_divergence=0.3/extract_segments/*.arrow"
).collect()

In [None]:
%%time
cols = ["grouping_path", "variants_path", "grouping_depth"]
df_joined = df_240703b.select(cols).join(
    df_240610b.select(cols),
    on=processing.categorical_list_hash(pl.col("grouping_path")),
    how="full",
    suffix="_240610b",
)

In [None]:
df_joined.group_by(
    pl.col("grouping_path").is_not_null(), pl.col("grouping_path_240610b").is_not_null()
).agg(pl.len()).rename(
    {
        "grouping_path": "240703_bottlenecked",
        "grouping_path_240610b": "240612_minion_bottlenecked",
    }
)

## J23100

In [None]:
align.pairwise_align(
    "GGAGTTTACGGCTAGCTCAGTCCTAGGTACAGTGCTAGC", "GGAGTTGACGGCTAGCTCAGTCCTAGGTACAGTGCTAGC"
)

In [None]:
x = df2.filter(pl.col("promoter|seq") == "GGAGTTTACGGCTAGCTCAGTCCTAGGTACAGTGCTAGC")

In [None]:
x[0, "cg"]

In [None]:
x[0, "realign_cg"]

In [None]:
idx = 0
path = x[idx, "variants_path"]
seq = x[idx, "consensus_seq"]

In [None]:
name_to_seq = sgfa.gfa_name_mapping(gfa)
ref_seq = sgfa.assemble_seq_from_path(name_to_seq, path)

In [None]:
seq

In [None]:
processing.pairwise_align(
    seq,
    ref_seq,
    **{"cigar_as_string": True, "method": "parasail", "degenerate": True},
)

In [None]:
processing.pairwise_align(
    seq,
    ref_seq,
    **{"cigar_as_string": True, "method": "parasail", "degenerate": False},
)

In [None]:
x.columns

In [None]:
y = processing.cut_cigar_df(
    x,
    gfa,
    path_column="variants_path",
    cigar_column="cg",
    sequence_column="consensus_seq",
    # phred_column="consensus_phred",
    query_start_column="query_start",
    query_end_column="query_end",
    query_length_column="query_length",
    path_start_column="path_start",
    path_end_column="path_end",
    struct_name="extract_segments2",
    keep_full=True,
)

In [None]:
y2 = processing.cut_cigar_df(
    x,
    gfa,
    path_column="variants_path",
    cigar_column="realign_cg",
    sequence_column="consensus_seq",
    # phred_column="consensus_phred",
    # query_start_column="query_start",
    # query_end_column="query_end",
    # query_length_column="query_length",
    # path_start_column="path_start",
    # path_end_column="path_end",
    struct_name="extract_segments2",
    keep_full=True,
)

In [None]:
plt.hist(
    x.select(pl.col("consensus_seq").str.len_bytes())["consensus_seq"],
    bins=100,
    log=True,
);

In [None]:
len(x[0, "consensus_seq"])

In [None]:
x.select("cg", "realign_cg")

In [None]:
y2[0, "extract_segments2"]

In [None]:
y[0, "extract_segments2"]

In [None]:
x["consensus_seq"]

In [None]:
x["cg"].value_counts(sort=True)

In [None]:
x[2, "cg"]

In [None]:
x[2].select(r"^.*\|cigar$")

In [None]:
df["promoter|variant"].value_counts(sort=True)

In [None]:
df2 = df.unnest("extract_segments")

In [None]:
df2.filter(pl.col("promoter|variant") == "J23100", pl.col("promoter|divergence") == 0)[
    "promoter|seq"
].value_counts(sort=True).to_pandas()

In [None]:
np.asarray(df2["ScmJ|divergence"]) != 0

In [None]:
np.count_nonzero(np.asarray(df2["ScmJ|divergence"]))

In [None]:
np.asarray(df2["ScmJ|divergence"])

In [None]:
np.isnan(np.asarray(df2["ScmJ|divergence"]))

In [None]:
def nonzero_frac(x):
    """Return the fraction of the non-NaN entries of ``x`` that are nonzero.

    Parameters
    ----------
    x : numpy.ndarray
        Float array; NaN entries are ignored.

    Returns
    -------
    float
        ``count_nonzero / count`` over the non-NaN entries, or NaN when there
        are no non-NaN entries (empty or all-NaN input).
    """
    valid = x[~np.isnan(x)]
    if len(valid) == 0:
        # nothing to measure; avoid ZeroDivisionError
        return float("nan")
    return np.count_nonzero(valid) / len(valid)


# One (column name, nonzero fraction) row per "|divergence" column of df2,
# sorted with the largest fraction first.
pl.DataFrame(
    [
        (c, nonzero_frac(np.asarray(df2[c])))
        for c in df2.columns
        if c.endswith("|divergence")
    ],
    schema=["col", "value"],
    # each tuple is one row; state it explicitly instead of relying on inference
    orient="row",
).sort("value", descending=True).to_pandas()

In [None]:
df2["grouping_depth"].value_counts(sort=True)

In [None]:
df2["pPhlF|seq"].value_counts(sort=True)

In [None]:
df2["BetI|seq"].value_counts(sort=True)

In [None]:
df2.filter(pl.col("grouping_depth") > 1)["BetI|seq"].value_counts(sort=True)

In [None]:
with pl.Config(tbl_rows=20):
    display(
        df2.filter(pl.col("grouping_depth") > 1)["BetI|mismatches"].value_counts(
            sort=True
        )
    )

In [None]:
# Print the most frequent BetI sequences (up to 50) as FASTA records.
seqs = df2["BetI|seq"].value_counts(sort=True)["BetI|seq"]
# head(50) rather than indexing range(50): no IndexError when there are fewer
# than 50 distinct sequences
for idx, beti_seq in enumerate(seqs.head(50)):
    print(f">beti{idx}")
    print(beti_seq)

In [None]:
df2[0, "variants_path"].to_list()

In [None]:
# Per promoter variant: the counts of each distinct "promoter|seq", sorted
# descending (struct field 1 of value_counts is the count). "total" is their
# sum and "correct" is the first, i.e. largest, count.
# NOTE(review): this treats the most frequent sequence as the correct one —
# confirm that is the intent.
df2.group_by("promoter|variant").agg(
    pl.col("promoter|seq").value_counts(sort=True).struct[1].alias("counts")
).with_columns(
    total=pl.col("counts").list.sum(), correct=pl.col("counts").list.get(0)
).with_columns(
    frac_correct=pl.col("correct") / pl.col("total")
).to_pandas()

In [None]:
df2.group_by("promoter|variant").agg(
    pl.col("CDS_mVenus|seq").value_counts(sort=True).struct[1].alias("counts")
).with_columns(
    total=pl.col("counts").list.sum(), correct=pl.col("counts").list.get(0)
).with_columns(
    frac_correct=pl.col("correct") / pl.col("total")
).to_pandas()

In [None]:
df2["BCD_upstream|divergence"].value_counts(sort=True)

In [None]:
df2["LacI|divergence"].value_counts(sort=True)

In [None]:
df2["PhlF|seq"].value_counts(sort=True)

In [None]:
df2["CDS_mVenus|divergence"].value_counts(sort=True)

In [None]:
df2["Term_DT3|divergence"].value_counts(sort=True)

In [None]:
df2["CDS_mVenus|divergence"].value_counts(sort=True)

In [None]:
df2.filter(pl.col("promoter|variant") == "J23103")["promoter|mismatches"].value_counts(
    sort=True
).to_pandas()

In [None]:
df2["CDS_mVenus|cigar"].value_counts(sort=True)

In [None]:
df2["promoter|divergence"].value_counts(sort=True)

In [None]:
df.filter(pl.col("promoter|variant") == "J23100")["promoter|seq"].value_counts(
    sort=True
).to_pandas()

In [None]:
df.filter(pl.col("promoter|variant") == "J23100")["promoter|cigar"].value_counts(
    sort=True
).to_pandas()

In [None]:
df.filter(pl.col("promoter|variant") == "J23100")["promoter|deletions"].value_counts(
    sort=True
)

In [None]:
df.filter(pl.col("promoter|variant") == "J23100")["promoter|divergence"].value_counts(
    sort=True
)

## Export to Eaton format

In [None]:
%%time
df = load_sequencing(
    # "/home/jqs1/scratch/sequencing/240703_pLIB476_bottlenecked/output/max_divergence=0.3/extract_segments/*.arrow"
    "/home/jqs1/scratch/sequencing/240718_pLIB463/20240718_1344_2B_PAU75540_5e4bf128/output/max_divergence=0.3/extract_segments/*.arrow"
).collect()

In [None]:
gfa = gfapy.Gfa.from_file(
    # "/home/jqs1/scratch/sequencing/240703_pLIB476_bottlenecked/references/pLIB476jqs.gfa"
    "/home/jqs1/scratch/sequencing/240718_pLIB463/20240718_1344_2B_PAU75540_5e4bf128/references/pLIB463.gfa"
)

In [None]:
df = processing.compute_divergences(
    df,
    list(dict.fromkeys(([s.split("=")[0] for s in gfa.segment_names]))),
    struct_name="extract_segments",
)
df2 = df.unnest("extract_segments")

In [None]:
counts = df2.select(
    pl.struct("RBS1:RBS|seq", "RBS2:RBS|seq", "RBS3:RBS|seq").alias("foo")
)["foo"].value_counts(sort=True)

In [None]:
counts.filter(pl.col("count") == 1)

In [None]:
counts.filter(pl.col("count") > 5)

In [None]:
with pl.Config(tbl_rows=100):
    display(counts)

In [None]:
perfect_segments = [
    "pBetI",
    "RBS1:upstream",
    "LacI",
    "pTac",
    "RBS2:upstream",
    "PhlF",
    "pPhlF",
    "RBS3:upstream",
    "BetI",
]

In [None]:
df2.select(pl.col(r"^.*\|divergence$"))

In [None]:
df2.filter(*[pl.col(f"{s}|divergence") == 0 for s in perfect_segments]).select(
    pl.struct("RBS1:RBS|seq", "RBS2:RBS|seq", "RBS3:RBS|seq").alias("foo")
)["foo"].value_counts(sort=True)

In [None]:
filter_columns = [
    "pBetI|divergence",
    "RBS1:upstream|divergence",
    "LacI|mismatches",
    "pTac|divergence",
    "RBS2:upstream|divergence",
    "PhlF|mismatches",
    "pPhlF|divergence",
    "RBS3:upstream|divergence",
    "BetI|mismatches",
]

In [None]:
48_088 / len(df2)

In [None]:
df2.filter(*[pl.col(c) == 0 for c in filter_columns])

In [None]:
%%time
# Build the export table from df2:
#   - barcode: "0"/"1" string derived from variants_path
#   - reference / cigar / subsample: empty-string placeholders; alignmentstart: constant 1
#   - consensus_seq is renamed to consensus
# then keep the listed columns, sort by barcode, and add two row-index columns
# ("barcodeid" and one with an empty name).
df_eaton = (
    df2.with_columns(
        barcode=path_to_barcode_string("variants_path"),
        reference=pl.lit(""),
        alignmentstart=1,
        cigar=pl.lit(""),
        subsample=pl.lit(""),
    )
    .rename({"consensus_seq": "consensus"})
    .select(
        "barcode",
        "consensus",
        "reference",
        "alignmentstart",
        "cigar",
        "subsample",
        "grouping_depth",
        "RBS1:RBS|seq",
        "RBS2:RBS|seq",
        "RBS3:RBS|seq",
        # same columns as the filter_columns list defined in an earlier cell
        "pBetI|divergence",
        "RBS1:upstream|divergence",
        "LacI|mismatches",
        "pTac|divergence",
        "RBS2:upstream|divergence",
        "PhlF|mismatches",
        "pPhlF|divergence",
        "RBS3:upstream|divergence",
        "BetI|mismatches",
    )
    .sort("barcode")
    .with_row_index(name="barcodeid")
    .with_row_index(name="")
)

In [None]:
df_eaton.write_csv("240718_pLIB463_eaton_export.tsv", separator="\t")

In [None]:
!pwd

# 240612_pLIB476_isolates

## prepare_reads

In [None]:
%%time
arrow_filename = "/home/jqs1/scratch/sequencing/240612_pLIB476_isolates/output/vg/prepare_reads/*.arrow"
df = concat_glob(arrow_filename).collect()

In [None]:
%%time
# Same four flag columns that load_sequencing adds, but computed from
# "full_path" instead of "variants_path":
#   dup / primary_alignment from duplicate read names; e2e / bc_e2e are true
#   when exactly two of the four listed oriented segment names occur in the path.
df2 = df.with_columns(
    dup=pl.col("name").is_duplicated(),
    primary_alignment=pl.col("name").is_first_distinct(),
    e2e=pl.col("full_path")
    .list.set_intersection(["<UNS9", ">UNS9", "<UNS3", ">UNS3"])
    .list.len()
    == 2,
    bc_e2e=pl.col("full_path")
    .list.set_intersection(["<BC:T7_prom", ">BC:T7_prom", "<BC:spacer2", ">BC:spacer2"])
    .list.len()
    == 2,
)

In [None]:
# Keep first-occurrence rows with bc_e2e set, flatten the extract_segments
# struct into top-level columns, and parse "sample" as an Int32 from the third
# "_"-separated field of the read name.
# NOTE(review): assumes names have at least three "_"-separated fields with an
# integer in the third — confirm against the naming scheme.
df3 = (
    df2.filter(pl.col("bc_e2e"), pl.col("primary_alignment"))
    .unnest("extract_segments")
    .with_columns(
        sample=pl.col("name").str.split_exact("_", 2).struct[2].cast(pl.Int32)
    )
)

In [None]:
# Per-sample mean of the mismatches / insertions / deletions columns for each
# of the 30 barcode bits (BC:bit0 .. BC:bit29), one row per sample, sorted.
df4 = (
    df3.group_by("sample")
    .agg(
        *[
            pl.col(f"BC:bit{bit}|{type_}").mean()
            for bit in range(30)
            for type_ in ("mismatches", "insertions", "deletions")
        ]
    )
    .sort("sample")
)

In [None]:
df4

In [None]:
df4.filter(pl.col("sample") == 1)

In [None]:
len(mismatches)

In [None]:
# One stacked step plot per row of df4 (one per sample): the mean mismatches,
# insertions and deletions for each of the 30 barcode bits.
bits = np.arange(30)
for row in df4.to_dicts():
    mismatches = [row[f"BC:bit{bit}|mismatches"] for bit in bits]
    insertions = [row[f"BC:bit{bit}|insertions"] for bit in bits]
    deletions = [row[f"BC:bit{bit}|deletions"] for bit in bits]
    plt.figure(figsize=(10, 3))
    # x runs -0.5, 0.5, ..., 29.5 (31 edges) and each series gets a trailing 0,
    # so with step="post" the step for bit k spans k-0.5 .. k+0.5
    plt.stackplot(
        [-0.5, *(bits + 0.5)],
        [*mismatches, 0],
        [*insertions, 0],
        [*deletions, 0],
        labels=["mismatches", "insertions", "deletions"],
        step="post",
    )
    plt.ylim([0, 6])
    plt.title(f"sample {row['sample']}")
    plt.xticks(bits)
    plt.legend();

In [None]:
# Stacked histogram of per-read mismatch / insertion / deletion counts for one
# (sample, bit) pair taken from df3.
sample = 4
bit = 17
# bin edges 0, 1, ..., 9 -> nine unit-width bins
bins = np.arange(10)
bin_centers = (bins[:-1] + bins[1:]) / 2
hists = {
    type_: np.histogram(
        df3.filter(pl.col("sample") == sample)[f"BC:bit{bit}|{type_}"], bins=bins
    )[0]
    for type_ in ("mismatches", "insertions", "deletions")
}
plt.figure(figsize=(6, 3))
# bin_centers - 1 is k - 0.5 for the bin starting at k, so with step="post"
# each step starts half a unit to the left of its integer count
plt.stackplot(
    bin_centers - 1,
    hists["mismatches"],
    hists["insertions"],
    hists["deletions"],
    labels=["mismatches", "insertions", "deletions"],
    step="post",
)
plt.title(f"sample {sample} bit {bit}")
plt.legend();

## extract_segments

In [None]:
%%time
arrow_filename = "/home/jqs1/scratch/sequencing/240612_pLIB476_isolates/output/primary_max_divergence=1/extract_segments/*.arrow"
df = concat_glob(arrow_filename).collect()

In [None]:
df2 = (
    df.filter(pl.col("name").is_first_distinct())
    .with_columns(barcode_str=path_to_barcode_string("variants_path"))
    .sort("barcode_str")
    .to_pandas()
)

In [None]:
df2[["barcode_str", "name", "grouping_depth"]]

In [None]:
df2[["barcode_str", "name", "grouping_depth"]]

In [None]:
df2.iloc[-1].loc["variants_path"]

In [None]:
df2.iloc[-2].loc["variants_path"]

In [None]:
%%time
arrow_filename = "/home/jqs1/scratch/sequencing/240612_pLIB476_isolates/output/primary_max_divergence=0.1/extract_segments/*.arrow"
df = concat_glob(arrow_filename).collect()

In [None]:
len(df.filter(pl.col("name").is_first_distinct()))

In [None]:
df.filter(pl.col("name").is_first_distinct())["grouping_depth"].to_pandas()

In [None]:
df2 = processing.compute_divergences(
    df, processing.unique_segments(df, "path"), struct_name="extract_segments"
)
df2 = df2.with_columns(
    sample=pl.col("name").str.split_exact("_", 2).struct[2].cast(pl.Int32)
)

In [None]:
df2.filter(pl.col("sample") == 11).sort("max_divergence")

In [None]:
plt.hist(df2.filter(pl.col("sample") == 11)["max_divergence"], bins=100);

In [None]:
df.head(100)

## Closest distance histogram

In [None]:
from sklearn.neighbors import KDTree

In [None]:
%%time
arrow_filename = "/home/jqs1/scratch/sequencing/240610_pLIB476_bottleneck/pLIB476_bottleneck/pLIB476/20240607_1433_MN35044_FAX60316_7d690112/output/max_divergence=0.05/extract_segments/*.arrow"
# arrow_filename = "/home/jqs1/scratch/sequencing/240612_pLIB476_isolates/output/primary_max_divergence=1/extract_segments/*.arrow"
df = concat_glob(arrow_filename).collect()

In [None]:
%%time
df2 = df.with_columns(
    dup=pl.col("name").is_duplicated(),
    is_primary_alignment=pl.col("name").is_first_distinct(),
    e2e=pl.col("variants_path")
    .list.set_intersection(["<UNS9", ">UNS9", "<UNS3", ">UNS3"])
    .list.len()
    == 2,
    bc_e2e=pl.col("variants_path")
    .list.set_intersection(["<BC:T7_prom", ">BC:T7_prom", "<BC:spacer2", ">BC:spacer2"])
    .list.len()
    == 2,
)

In [None]:
%%time
# Keep rows with is_primary_alignment and e2e set and consensus_depth >= 0,
# then add three barcode columns: barcode2 from the 30 "BC:bit{idx}|variant"
# columns (cast to a fixed-size boolean array), and grouping_barcode /
# variants_barcode derived from the two path columns.
# NOTE(review): path_to_barcode_array is not defined in any cell visible in
# this notebook (only path_to_barcode_string is) — confirm where it comes from.
df3 = df2.filter(
    pl.col("is_primary_alignment"), pl.col("e2e"), pl.col("consensus_depth") >= 0
).with_columns(
    barcode2=pl.concat_list([f"BC:bit{idx}|variant" for idx in range(30)]).cast(
        pl.Array(pl.Boolean, 30)
    ),
    grouping_barcode=path_to_barcode_array("grouping_path"),
    variants_barcode=path_to_barcode_array("variants_path"),
)

In [None]:
len(df3)

In [None]:
grouping_barcodes = df3["grouping_barcode"].to_numpy().astype(np.int16)
variants_barcodes = df3["variants_barcode"].to_numpy().astype(np.int16)
barcodes2 = df3["barcode2"].to_numpy().astype(np.int16)

In [None]:
(grouping_barcodes != variants_barcodes).any(axis=1).sum()

In [None]:
grouping_barcodes[:10]

In [None]:
variants_barcodes[:10]

In [None]:
x = df3.filter(pl.col("barcode") == list(barcodes[2]))
x

In [None]:
x[0, "grouping_path"].to_list()

In [None]:
x[0, "variants_path"].to_list()

In [None]:
x[0, "grouping_path"]

In [None]:
df3.select(pl.col("barcode").is_duplicated().sum())

In [None]:
names = df3["name"]

In [None]:
names[9553]

In [None]:
df3[1152, "barcode"].to_numpy()

In [None]:
df3[9553, "barcode"].to_numpy()

In [None]:
barcodes.shape

In [None]:
print("\n".join(df3[1152, "variants_path"].to_list()))

In [None]:
%%time
# Nearest-neighbour L1 distance for every barcode: build a KD-tree over the
# barcodes and query it with the same points at k=2.
# NOTE(review): `barcodes` is not assigned in the visible cells (only
# grouping_barcodes / variants_barcodes / barcodes2 are) — confirm which is meant.
kd = KDTree(barcodes, leaf_size=40, metric="l1")
dists, closest = kd.query(barcodes, k=2)
# keep the second-nearest column; presumably column 0 is each point matching
# itself at distance 0 — TODO confirm this holds when barcodes are duplicated
dists = dists[:, 1].astype(np.int16)

In [None]:
np.where(dists == 0)

In [None]:
bc_dups = barcodes[dists == 0]

In [None]:
(bc_dups * (2 ** np.arange(30))[np.newaxis, :]).sum(axis=1)

In [None]:
counts = np.bincount(dists)
counts

In [None]:
plt.step(np.arange(len(counts)), counts, where="mid");

In [None]:
counts

In [None]:
plt.step(np.arange(len(counts)), counts, where="mid");

# 240610_pLIB476 vs. 240510_pLIB473-476

In [None]:
# arrow_filename = "/home/jqs1/scratch/sequencing/240610_pLIB476_bottleneck/pLIB476_bottleneck/pLIB476/20240607_1433_MN35044_FAX60316_7d690112/"
arrow_filename = "/home/jqs1/scratch/sequencing/240513_pLIB473_476/20240513_1645_2C_PAW46239_b49d575f/prepare_reads.all_segments/*.arrow"
df = concat_glob(arrow_filename)  # .collect()

In [None]:
gfa_filename = "/home/jqs1/scratch/sequencing/240610_pLIB476_bottleneck/pLIB476_bottleneck/pLIB476/20240607_1433_MN35044_FAX60316_7d690112/references/pLIB476jqs.gfa"
gfa = gfapy.Gfa.from_file(gfa_filename)

In [None]:
df.schema

In [None]:
gfa.segment_names

In [None]:
df2 = df.filter(pl.col("end_to_end")).head(100).collect()

In [None]:
df2.with_columns(
    barcode=pl.concat_list(
        [
            pl.col("extract_segments").struct.field(f"BC:bit{idx}|variant")
            for idx in range(30)
        ]
    )
)["barcode"]

In [None]:
df3[22, "full_path"].to_list()

In [None]:
df.schema["extract_segments"].fields

In [None]:
# TODO: use exclude after release including https://github.com/pola-rs/polars/issues/16661
# Sum the mismatches/insertions/deletions fields of the extract_segments
# struct, leaving out the upstream/downstream fields.
# The struct column is "extract_segments" (as in df.schema["extract_segments"]
# and the barcode cells); "extract_columns" does not exist.
seg_col = pl.col("extract_segments").struct.field
df2 = (
    df.head(10)
    .with_columns(
        divergence=pl.sum_horizontal(
            seg_col(r"\|(mismatches|insertions|deletions)").exclude(
                r"upstream\|(mismatches|insertions|deletions)",
                r"downstream\|(mismatches|insertions|deletions)",
            )
        )
    )
    .collect()
)

In [None]:
# Row-wise sum of the per-segment matches/mismatches/insertions/deletions
# fields of the extract_segments struct, with nulls counted as zero.
# The struct column is "extract_segments" (as in df.schema["extract_segments"]
# and the barcode cells); "extract_columns" does not exist.
# NOTE(review): forward_segments is not assigned in the visible cells; s[1:]
# suggests its entries carry a one-character orientation prefix — confirm.
seg_col = pl.col("extract_segments").struct.field
df2 = (
    df.head(10)
    .with_columns(
        divergence=pl.sum_horizontal(
            [
                seg_col(f"{s[1:]}|{type_}").fill_null(strategy="zero")
                for type_ in ("matches", "mismatches", "insertions", "deletions")
                for s in forward_segments
            ]
        )
    )
    .collect()
)

In [None]:
df2["divergence"]

In [None]:
# Keep rows with end_to_end set and add "barcode": a list column built from
# the 30 "BC:bit{idx}|variant" fields of the extract_segments struct, in bit order.
df2 = df.filter(pl.col("end_to_end")).with_columns(
    barcode=pl.concat_list(
        [
            pl.col("extract_segments").struct.field(f"BC:bit{idx}|variant")
            for idx in range(30)
        ]
    )
)

In [None]:
df3 = df2.head(100).collect()

In [None]:
%%time
df4 = df2.filter(pl.col("barcode") == df3[22, "barcode"].to_list()).collect()

In [None]:
df4