In [None]:
import itertools as it
import operator
import re
from collections import Counter
from pathlib import Path

import holoviews as hv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.compute as pc
import pysam
from pyarrow import csv
from tqdm.auto import tqdm, trange

In [None]:
# Load IPython's autoreload extension; mode 2 re-imports all modules before each
# cell executes, so edits to project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import paulssonlab.sequencing.io as sio

In [None]:
# Select bokeh as the holoviews plotting backend.
hv.extension("bokeh")

In [None]:
import pyinstrument

# Registers the %pyinstrument magic used by the benchmarking cells.
%load_ext pyinstrument
import line_profiler

# Registers the %lprun magic used by the benchmarking cells.
%load_ext line_profiler

# Config

In [None]:
# Directory holding the input files for this notebook: the `*_barcodeonly.gaf` files
# are globbed from here, and the BAM path is derived from the chosen GAF path.
# NOTE(review): absolute, user-specific path — must be edited to run elsewhere.
data_dir = Path(
    "/home/jqs1/scratch/jqs1/sequencing/230930_alignment_test/230707_repressilators/"
)

# GAF to Parquet

In [None]:
# Sort the matches: Path.glob() yields entries in arbitrary order, so without sorting
# `gaf_filenames[0]` could select a different file from one run to the next.
gaf_filenames = sorted(data_dir.glob("*_barcodeonly.gaf"))

In [None]:
# Show which GAF files were found.
gaf_filenames

In [None]:
# Work with the first GAF file from here on (raises IndexError if nothing matched).
gaf_filename = gaf_filenames[0]

In [None]:
# Size of the GAF file in MB. stat() follows symlinks, so a symlinked input reports
# the size of the file it points to; lstat() would report the size of the link itself.
gaf_filename.stat().st_size / 1e6

In [None]:
%%time
# Read the whole GAF file into a single pyarrow Table. sio.iter_gaf presumably yields
# RecordBatches (confirm in paulssonlab.sequencing.io); tqdm reports progress as they
# are consumed.
gaf_table = pa.Table.from_batches(tqdm(sio.iter_gaf(gaf_filename)))

In [None]:
# In-memory size of the table in MB.
gaf_table.nbytes / 1e6

In [None]:
# Per-column in-memory size, floored to whole MB, keyed by column name.
column_mb = {}
for column_name in gaf_table.column_names:
    column_mb[column_name] = gaf_table.column(column_name).nbytes // 1e6
column_mb

# GAF/BAM to Parquet

In [None]:
# BAM file paired with the GAF file: same path with "_barcodeonly.gaf" swapped for ".bam".
bam_filename = str(gaf_filename).replace("_barcodeonly.gaf", ".bam")

In [None]:
# check_sq=False tells pysam not to require @SQ (reference sequence) header lines.
bam = pysam.AlignmentFile(bam_filename, check_sq=False)

In [None]:
# Grab the first record in file order (until_eof=True iterates without an index)
# and show its name.
read = next(bam.fetch(until_eof=True))
read.query_name

In [None]:
# Rewind the handle after consuming a record above.
bam.reset()

In [None]:
# Read-name lookup over the open BAM handle; it is populated by build() in the next cell.
bam_index = pysam.IndexedReads(bam)

In [None]:
%%time
# Time building the read-name index, starting from a rewound handle.
bam.reset()
bam_index.build()

In [None]:
# First batch from the combined BAM + GAF reader, given file paths here.
batch = next(sio.iter_bam_and_gaf(bam_filename, gaf_filename))

In [None]:
# Dump every public (non-underscore) attribute of the example read.
{k: getattr(read, k) for k in dir(read) if not k.startswith("_")}

In [None]:
read.query_sequence

In [None]:
import array
import random

In [None]:
# 50,000 random values in [0, 255] stored as an unsigned-byte array; this is the
# input for the pyarrow conversion timings that follow.
x = array.array("B", (random.randint(0, 255) for _sample in range(50000)))

In [None]:
# Time building a uint8 pyarrow array directly from the array.array ...
%timeit pa.array(x, pa.uint8())

In [None]:
# ... versus converting it to a numpy array first.
%timeit pa.array(np.asarray(x), pa.uint8())

In [None]:
read.query_qualities

In [None]:
# Check that slices of different lengths convert to a list<uint8> pyarrow array.
pa.array([read.query_qualities[:3], read.query_qualities[:5]], pa.list_(pa.uint8()))

## Benchmarking

In [None]:
# Profile pulling the first 10 items from iter_bam_and_gaf, passing the open BAM handle
# and the prebuilt read-name index. The trailing [:0] leaves an empty list so nothing
# large is displayed.
%pyinstrument list(it.islice(sio.iter_bam_and_gaf(bam, gaf_filename, bam_index=bam_index), 10))[:0]

In [None]:
# Same workload, line-by-line profile of sio.iter_bam_and_gaf itself.
%lprun -f sio.iter_bam_and_gaf list(it.islice(sio.iter_bam_and_gaf(bam, gaf_filename, bam_index=bam_index), 10))[:0]

In [None]:
# Rebinds `x` (previously the random byte array) to the example read's qualities.
x = read.query_qualities

In [None]:
np.array(x)

In [None]:
# Profile the full run (no islice limit), this time passing the BAM file path.
%lprun -f sio.iter_bam_and_gaf list(sio.iter_bam_and_gaf(bam_filename, gaf_filename))[:0]

In [None]:
%pyinstrument list(sio.iter_bam_and_gaf(bam_filename, gaf_filename))[:0]

In [None]:
%%time
# Full conversion to a pyarrow Table, passing the BAM file path.
table = pa.Table.from_batches(tqdm(sio.iter_bam_and_gaf(bam_filename, gaf_filename)))

In [None]:
%%time
# Same conversion, but passing the already-open BAM handle and the read-name index.
table2 = pa.Table.from_batches(
    tqdm(sio.iter_bam_and_gaf(bam, gaf_filename, bam_index=bam_index))
)

In [None]:
# Keep only the first two items yielded, for inspection (rebinds `x` again).
x = list(
    tqdm(it.islice(sio.iter_bam_and_gaf(bam, gaf_filename, bam_index=bam_index), 2))
)

In [None]:
x[0].column("name")

In [None]:
# Length of the "name" column in each of the two items.
len(x[0].column("name"))

In [None]:
len(x[1].column("name"))

In [None]:
# In-memory size in MB of the table built from the file path.
table.nbytes / 1e6

In [None]:
bam_filename

In [None]:
# Row count of the table built via the open handle and index.
len(table2)

In [None]:
# Count how many times each value of the "name" column occurs in table2.
z = table2.column("name").value_counts()

In [None]:
# Order the (values, counts) entries by count, largest first.
zz = z.sort("descending", "counts")

In [None]:
# Keep only the names whose count is not 1.
zzz = pc.filter(zz.field("values"), pc.not_equal(zz.field("counts"), 1))

In [None]:
zzz

In [None]:
# Index within table2's "name" column at which each of those names is found.
pc.index_in(zzz, table2.column("name"))

In [None]:
# Names whose count is not 1, selected with an explicit element-wise mask.
# numpy-style `values[counts != 1]` does not work on pyarrow arrays: `!=` does not build
# an element-wise mask there, so the subscript would not filter anything.
zz.field("values").filter(pc.not_equal(zz.field("counts"), 1))

In [None]:
# Rows in table2 beyond one per distinct name (total rows minus distinct names).
len(table2) - len(pc.unique(table2.column("name")))

In [None]:
%%time
# Timing: count distinct names with pyarrow compute ...
len(pc.unique(table2.column("name")))

In [None]:
%%time
# ... versus a Python set. NOTE(review): this cell reads `table`, not `table2`.
len(set(table.column("name")))

In [None]:
%%time
# Time a bare pass over every record in the BAM, only counting them.
# The loop also rebinds the notebook-level `read` variable.
bam.reset()
reads = 0
for read in bam.fetch(until_eof=True):
    reads += 1

In [None]:
%%time
# Time collecting the distinct query names; rebinds `reads` from the int count to a set.
bam.reset()
reads = set(read.query_name for read in tqdm(bam.fetch(until_eof=True)))

In [None]:
%%time
bam.reset()
reads = 0
names = []
for read in it.islice(bam.fetch(until_eof=True), 10):
    names.append(read.query_name)
    reads += 1
for read in it.islice(bam.fetch(until_eof=True), 10):
    names.append(read.query_name)
    reads += 1

In [None]:
names

In [None]:
# NOTE(review): len() needs `reads` to be a sized collection (e.g. the set of query
# names); if `reads` is an int counter when this cell runs, it raises TypeError.
len(reads)

In [None]:
# Row count of the table built from the file path, for comparison.
len(table)