In [None]:
import itertools as it
import operator
from collections import Counter, defaultdict
from functools import partial
from glob import glob
from pathlib import Path

import gfapy
import holoviews as hv
import hvplot.pandas
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.dataset as ds
from tqdm.auto import tqdm, trange

In [None]:
# Enable IPython's autoreload extension in mode 2 so edited modules are
# re-imported automatically before each cell executes.
%load_ext autoreload
%autoreload 2

In [None]:
import paulssonlab.sequencing.align as align
import paulssonlab.sequencing.cigar as scigar
import paulssonlab.sequencing.consensus as con
import paulssonlab.sequencing.gfa as sgfa
import paulssonlab.sequencing.io as sio
import paulssonlab.sequencing.processing as processing
from paulssonlab.util.sequence import reverse_complement

In [None]:
# Select matplotlib as the HoloViews rendering backend.
hv.extension("matplotlib")

In [None]:
# Turn on polars' global string cache for the rest of the session.
# NOTE(review): presumably needed so categorical columns coming from separately
# scanned files can be combined — confirm.
pl.enable_string_cache()

# Functions

In [None]:
def concat_glob(filename):
    """Lazily scan every Arrow IPC file matching a glob pattern and concatenate them.

    Parameters
    ----------
    filename : str
        Glob pattern (as understood by ``glob.glob``) selecting the IPC files.

    Returns
    -------
    polars.LazyFrame
        The per-file lazy scans concatenated with ``how="diagonal"``.

    Raises
    ------
    ValueError
        If the pattern matches no files.
    """
    # glob() yields matches in arbitrary, filesystem-dependent order. Sorting
    # makes the row order reproducible, which matters for order-sensitive
    # steps such as ``is_first_distinct`` in ``load_sequencing``.
    paths = sorted(glob(filename))
    if not paths:
        # Fail with the offending pattern instead of polars' generic
        # empty-concat error.
        raise ValueError(f"no files match glob pattern {filename!r}")
    return pl.concat([pl.scan_ipc(path) for path in paths], how="diagonal")

In [None]:
def load_sequencing(filename, filter=True):
    """Scan the IPC files matching ``filename`` and add per-row alignment flags.

    Columns added:

    * ``is_primary_alignment`` -- only when the column is absent; marks the
      first row seen for each ``name``.
    * ``dup`` -- whether ``name`` occurs on more than one row.
    * ``e2e`` -- whether exactly two of the four oriented UNS9/UNS3 segment
      names occur in ``variants_path``.
    * ``bc_e2e`` -- the same check for the BC:T7_prom/BC:spacer2 names.

    Parameters
    ----------
    filename : str
        Glob pattern forwarded to ``concat_glob``.
    filter : bool, default True
        When true, keep only rows where both ``is_primary_alignment`` and
        ``e2e`` hold. (The name shadows the builtin; it is part of the
        existing interface.)

    Returns
    -------
    polars.LazyFrame
    """

    def spans_both(end_segments):
        # True when exactly two of the given segment names occur in variants_path.
        return (
            pl.col("variants_path").list.set_intersection(end_segments).list.len()
            == 2
        )

    reads = concat_glob(filename)
    if "is_primary_alignment" not in reads.columns:
        reads = reads.with_columns(
            is_primary_alignment=pl.col("name").is_first_distinct()
        )
    reads = reads.with_columns(
        dup=pl.col("name").is_duplicated(),
        e2e=spans_both(["<UNS9", ">UNS9", "<UNS3", ">UNS3"]),
        bc_e2e=spans_both(
            ["<BC:T7_prom", ">BC:T7_prom", "<BC:spacer2", ">BC:spacer2"]
        ),
    )
    if not filter:
        return reads
    return reads.filter(pl.col("is_primary_alignment"), pl.col("e2e"))

In [None]:
def path_to_barcode_string(path_col, bits=range(30)):
    """Build a polars expression encoding barcode bits as a string of "0"/"1".

    For each bit index, the character is "1" when the path list contains
    either ``>BC:bit{bit}=1`` or ``<BC:bit{bit}=1``, and "0" otherwise. The
    characters are concatenated in the order ``bits`` is iterated.

    Parameters
    ----------
    path_col : str or polars.Expr
        Column name, or expression, of the list-typed path column.
    bits : iterable of int, default ``range(30)``
        Bit indices to encode. The default is an immutable ``range`` rather
        than a list, so no mutable object is shared between calls.

    Returns
    -------
    polars.Expr
        String expression with one character per requested bit.
    """
    if isinstance(path_col, str):
        path_col = pl.col(path_col)
    return pl.concat_str(
        [
            pl.when(
                # The bit segment may be traversed in either orientation.
                path_col.list.contains(f">BC:bit{bit}=1")
                | path_col.list.contains(f"<BC:bit{bit}=1")
            )
            .then(pl.lit("1"))
            .otherwise(pl.lit("0"))
            for bit in bits
        ]
    )

# Debug null paths

In [None]:
# Lazily scan the Arrow IPC files under prepare_consensus/ignore/.
df = pl.scan_ipc(
    "/home/jqs1/scratch/sequencing/241007_pLIB502-503/prepare_consensus/ignore/*.arrow"
)  # .select("path", "read_seq")

In [None]:
# List the available columns.
# NOTE(review): on recent polars, LazyFrame.columns resolves the schema and may
# emit a PerformanceWarning — confirm against the installed version.
df.columns

In [None]:
# Tally of is_primary_alignment values across the scanned rows.
df.select(pl.col("is_primary_alignment").value_counts()).collect()

In [None]:
%%time
# Among primary alignments with a null path: row count and mean read_seq
# length in bytes.
df.filter(pl.col("path").is_null(), pl.col("is_primary_alignment")).select(
    pl.len(), pl.col("read_seq").str.len_bytes().mean()
).collect()

In [None]:
%%time
# read_seq byte lengths for the first 10,000 rows whose path is null.
x = (
    df.filter(pl.col("path").is_null())
    .head(10_000)
    .select(pl.col("read_seq").str.len_bytes())
    .collect()["read_seq"]
)

In [None]:
%%time
# Up to 100 full rows with a null path and a read_seq longer than 5000 bytes.
y = (
    df.filter(pl.col("path").is_null(), pl.col("read_seq").str.len_bytes() > 5000)
    .head(100)
    .collect()
)

In [None]:
# Print the length and sequence of one long null-path read for manual inspection.
# NOTE(review): row index 45 is hard-coded; this fails if y has fewer than 46 rows.
s = y[45, "read_seq"]
print(len(s))
print(s)

In [None]:
# Histogram of the sampled read lengths in x; the trailing ";" suppresses the
# return-value repr.
plt.hist(x, bins=100);

In [None]:
# Rebind df to a lazy scan of the files directly under prepare_consensus/
# (this replaces the ignore/ scan bound to df earlier).
df = pl.scan_ipc(
    "/home/jqs1/scratch/sequencing/241007_pLIB502-503/prepare_consensus/*.arrow"
)

In [None]:
%%time
# Total number of rows.
df.select(pl.len()).collect()

In [None]:
%%time
# Within the first 100,000 rows, compare null-path and non-null-path rows:
# row count and mean read_seq length in bytes for each group.
df.head(100_000).group_by(pl.col("path").is_null()).agg(
    pl.len(), pl.col("read_seq").str.len_bytes().mean()
).collect()

# Load

In [None]:
# Fields of the "variants_segments" struct that the load cell extracts into
# top-level columns of df.
segment_columns = [
    "sigma:promoter|variant",
    "sigma:promoter|divergence",
    "antisigma:promoter|variant",
    "antisigma:promoter|divergence",
    "reporter:promoter|variant",
    "reporter:promoter|divergence",
    "sigma:RBS:RiboJ|divergence",
    "sigma:RBS:BCD_leader|divergence",
    "antisigma:RBS:RiboJ|divergence",
    "antisigma:RBS:BCD_leader|divergence",
    "reporter:RBS:RiboJ|divergence",
    "reporter:RBS:BCD_leader|divergence",
    "sigma:RBS|seq",
    "antisigma:RBS|seq",
    "reporter:RBS|seq",
]

In [None]:
# Parse the reference GFA file; its segment names are used by the load cell
# when computing divergences.
gfa = gfapy.Gfa.from_file(
    "/home/jqs1/scratch/sequencing/sequencing_references/pLIB502-503.gfa"
)

In [None]:
%%time
df = load_sequencing(
    "/home/jqs1/scratch/sequencing/241007_pLIB502-503/output/max_divergence=0.3/extract_segments/*.arrow"
)
df = processing.compute_divergences(
    df,
    list(dict.fromkeys(([s.split("=")[0] for s in gfa.segment_names]))),
    struct_name="variants_segments",
)
df = df.select(
    pl.col(
        "grouping_path",
        # "consensus_seq",
        "name",
        "grouping_path_hash",
        "grouping_depth",
        "consensus_depth",
        "strand",
        "variants_path",
        # "variants_segments",
        "is_primary_alignment",
        "dup",
        "e2e",
        "bc_e2e",
    ),
    *[pl.col("variants_segments").struct[f] for f in segment_columns]
)
df = df.collect()

In [None]:
# Estimated in-memory size of the collected frame, in GB.
df.estimated_size("gb")

In [None]:
# Column names of the collected frame.
df.columns

# Diagnostics

In [None]:
# Estimated size of each column in GB, as (name, size) pairs, largest first.
sorted(
    ((name, df[name].estimated_size("gb")) for name in df.columns),
    key=lambda item: item[1],
    reverse=True,
)

In [None]:
# Estimated size in GB of each field of the "variants_segments" struct column,
# largest first.
# NOTE(review): "variants_segments" is commented out of the select in the Load
# section, so df presumably has no such column and this cell fails as written —
# confirm, or re-enable that column before running.
sorted(
    {
        col: df["variants_segments"].struct[col].estimated_size("gb")
        for col in df["variants_segments"].struct.fields
    }.items(),
    key=operator.itemgetter(1),
    reverse=True,
)

In [None]:
# Occurrences of each sigma promoter variant.
df["sigma:promoter|variant"].value_counts()

In [None]:
# Occurrences of each antisigma promoter variant.
df["antisigma:promoter|variant"].value_counts()

In [None]:
# Row count for each (sigma, antisigma) promoter variant pair, ascending.
df.group_by(pl.col("sigma:promoter|variant", "antisigma:promoter|variant")).agg(
    pl.len()
).sort("len")

In [None]:
# NOTE(review): this repeats the sigma promoter value_counts cell at the top of
# this section — likely redundant.
df["sigma:promoter|variant"].value_counts()

In [None]:
# NOTE(review): "repressor:RBS|variant" is not among the columns kept by the
# select in the Load section, so this lookup presumably fails on a fresh run —
# confirm or remove.
df["repressor:RBS|variant"].value_counts()

In [None]:
# Counts of each unique (promoter, repressor, repressor RBS) variant
# combination, most frequent first.
# NOTE(review): none of these three columns is selected in the Load section;
# this cell looks carried over from a different dataset — confirm or remove.
df.select(
    pl.struct(
        "promoter|variant", "repressor|variant", "repressor:RBS|variant"
    ).value_counts(sort=True)
).unnest("promoter|variant")

In [None]:
# Curve of the sorted combination counts computed the same way as the previous cell.
# NOTE(review): depends on the same three columns that are not selected in the
# Load section — confirm or remove.
hv.Curve(
    df.select(
        pl.struct(
            "promoter|variant", "repressor|variant", "repressor:RBS|variant"
        ).value_counts(sort=True)
    ).unnest("promoter|variant")["count"]
)

In [None]:
# 0.99 quantile of every "...|divergence" column, shown as one row per column
# and sorted from largest to smallest.
# fill_nan(None) is polars' dedicated NaN -> null conversion; it replaces the
# earlier generic replace(np.nan, None), which depended on value matching
# against NaN. Presumably the point is to keep NaN out of the quantile — confirm.
df.select(
    pl.col(r"^.*\|divergence$").fill_nan(None),
).quantile(
    0.99
).to_pandas().T.sort_values(0, ascending=False)

In [None]:
# Step plot of grouping_depth values sorted in descending order, log-scaled y axis.
df["grouping_depth"].sort(descending=True).to_pandas().hvplot.step(
    logy=True, height=800
)

In [None]:
# With rows sorted by ascending grouping_depth:
#   frac_barcodes = running fraction of rows (1/N, 2/N, ..., 1)
#   frac_reads    = running fraction of the summed grouping_depth
# Both are passed to hvplot.step with grouping_depth on a log x axis.
df.sort("grouping_depth").select(
    pl.col("grouping_depth"),
    frac_barcodes=pl.int_range(1, pl.len() + 1, dtype=pl.UInt32) / pl.len(),
    frac_reads=pl.col("grouping_depth").cum_sum() / pl.col("grouping_depth").sum(),
).to_pandas().hvplot.step("grouping_depth", logx=True, logy=False, where="pre")

In [None]:
# Number of rows for each unique combination of promoter variants and RBS
# sequences, most frequent first. The struct column is named "foo".
counts = (
    df.select(
        foo=pl.struct(
            "sigma:promoter|variant",
            "antisigma:promoter|variant",
            "sigma:RBS|seq",
            "antisigma:RBS|seq",
            "reporter:RBS|seq",
        )
    )
    .get_column("foo")
    .value_counts(sort=True)
)

In [None]:
# Step plot of the sorted combination counts on log-log axes.
counts["count"].to_pandas().hvplot.step(logy=True, logx=True)

# Export to Eaton format

In [None]:
%%time
df_eaton = (
    df.with_columns(
        barcode=path_to_barcode_string("variants_path"),
        reference=pl.lit(""),
        alignmentstart=1,
        cigar=pl.lit(""),
        subsample=pl.lit(""),
        consensus_seq=pl.lit(""),  # if not including consensus seq
    )
    .rename({"consensus_seq": "consensus"})
    # .select(
    #     "barcode",
    #     "consensus",
    #     "reference",
    #     "alignmentstart",
    #     "cigar",
    #     "subsample",
    #     "grouping_depth",
    #     "consensus_depth",
    #     "promoter|variant",
    #     "promoter|seq",
    #     "RBS|seq",
    #     "repressor:promoter|variant",
    #     "repressor:promoter|seq",
    #     "repressor|variant",
    #     "promoter|divergence",
    #     "ScmJ|divergence",
    #     "RBS:RiboJ|divergence",
    #     "RBS:BCD_leader|divergence",
    #     "repressor:promoter|divergence",
    #     "repressor|divergence",
    #     "mVenus|divergence",
    # )
    .sort("barcode")
    .with_row_index(name="barcodeid")
    .with_row_index(name="")
)

In [None]:
# Display the export table.
df_eaton

In [None]:
# Write the export table as Parquet; the path is relative to the kernel's
# working directory.
df_eaton.write_parquet("241016_pLIB502-503_eaton_export.parquet")

In [None]:
# Show the working directory, i.e. where the relative output path above resolves.
!pwd