In [None]:
import itertools as it
import operator
import re
from collections import Counter
from pathlib import Path

import awkward as ak
import bottleneck as bn
import duckdb
import gfapy
import holoviews as hv
import ibis
import matplotlib.pyplot as plt
import numba
import numpy as np
import pandas as pd
import polars as pl
import pyabpoa
import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.dataset as ds
import pyfastx
import pysam
import spoa
from pyarrow import csv
from tqdm.auto import tqdm, trange

In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (e.g. the project packages under development) are re-imported before each cell runs.
%load_ext autoreload
%autoreload 2

In [None]:
import paulssonlab.sequencing.consensus as con
import paulssonlab.sequencing.io as sio
from paulssonlab.util.sequence import reverse_complement

In [None]:
# Activate the Bokeh plotting backend for HoloViews.
hv.extension("bokeh")

In [None]:
# Profiling tools: load the pyinstrument and line_profiler IPython extensions
# and import both packages for direct use.
%load_ext pyinstrument
import line_profiler
import pyinstrument

%load_ext line_profiler

# Consensus

In [None]:
# Arrow IPC file with the read groups used throughout this notebook.
test_groups_path = (
    "/home/jqs1/scratch/jqs1/sequencing/scratch/test_read_groups_100.arrow"
)
test_groups = pl.read_ipc(test_groups_path)

In [None]:
# Depth of each group = length of its `name` list. Keep groups with more than
# one entry, order them deepest-first, and export as a 2-D NumPy array
# (a single column, indexed below as [:, 0]).
group_depths = (
    test_groups.select(pl.col("name").list.len())
    .filter(pl.col("name").gt(1))
    .sort(pl.col("name"), descending=True)
    .to_numpy()
)

In [None]:
# Plot the depth of every retained group, in descending order.
# Uses the explicit Axes interface with labels so the figure stands alone,
# and plt.show() so the cell does not echo the Line2D repr.
fig, ax = plt.subplots()
ax.plot(group_depths[:, 0])
ax.set(
    title="Read-group depth distribution",
    xlabel="group rank (sorted by descending depth)",
    ylabel="group depth (length of `name` list)",
)
plt.show()

In [None]:
%%time
test_groups_subset = test_groups.filter(pl.col("name").list.len().is_between(100, 120))

In [None]:
# Show only the depth-related columns of the selected groups.
test_groups_subset.select(pl.col("depth", "duplex_depth"))

In [None]:
# Expand the first selected group into one row per read.
first_group = test_groups_subset[0]

# A ";" in the read name is what marks a read as duplex here; simplex is its negation.
name_has_semicolon = pl.col("name").str.contains(";")

test_reads = (
    first_group.select(
        pl.col("name", "read_seq", "read_phred", "reverse_complement").list.explode(),
        pl.col("duplex_depth", "simplex_depth"),
    )
    .with_columns(
        read_len=pl.col("read_seq").str.len_bytes(),
        is_duplex=name_has_semicolon,
        is_simplex=~name_has_semicolon,
    )
    # Duplex reads first.
    .sort("is_duplex", descending=True)
)
test_reads

In [None]:
%%time
read_seq = test_reads.get_column("read_seq").to_list()
read_rc = ak.from_arrow(test_reads.get_column("reverse_complement").to_arrow())
read_phred = test_reads.get_column("read_phred").to_arrow()
read_seq_oriented = [
    reverse_complement(seq) if rc else seq for seq, rc in zip(read_seq, read_rc)
]
read_phred_oriented = ak.from_arrow(
    pa.array(
        [
            phred.values.to_numpy()[::-1] if rc else phred.values.to_numpy()
            for phred, rc in zip(read_phred, read_rc)
        ]
    )
)

In [None]:
%%time
seqs, phreds = con.prepare_reads(
    test_reads.get_column("read_seq").to_list(),
    test_reads.get_column("reverse_complement").to_arrow(),
    test_reads.get_column("read_phred").to_arrow(),
)

In [None]:
%%time
aligned_phreds = con.align_phreds(msa_seq, read_phred_oriented)

In [None]:
%%time
msa_seqs = con.msa(seqs, method="abpoa")

In [None]:
# Print the alignment together with the phreds returned by con.prepare_reads.
con.print_msa(msa_seqs, phreds)

In [None]:
%%time
(
    consensus,
    consensus_phred,
    nonconsensus,
    nonconsensus_phred,
) = con.phred_weighted_consensus(msa_seq, read_phred_oriented)

## Numba

In [None]:
%%time
(
    consensus,
    consensus_phred,
    conconsensus,
    nonconsensus_phred,
) = con.phred_weighted_consensus(msa_seq, read_phred_oriented)

In [None]:
# Display the consensus returned by con.phred_weighted_consensus.
consensus

In [None]:
# Decode the consensus buffer to text. The disabled `.replace` would strip "-"
# characters (presumably alignment gaps — TODO confirm).
consensus.tobytes().decode()  # .replace("-", "")

In [None]:
# Number of oriented per-read phred arrays.
len(read_phred_oriented)

In [None]:
# Length of the MSA result; compare with len(read_phred_oriented).
# Uses `msa_seqs`; the former `msa_seq` is never assigned in this notebook.
len(msa_seqs)