# Imports

In [None]:
import asyncio
import time
import traceback
import warnings
from collections import defaultdict, namedtuple
from collections.abc import Mapping, Sequence
from functools import partial
from glob import glob
from importlib import reload
from numbers import Number
from operator import getitem

import cachetools
import dask
import distributed
import holoviews as hv
import hvplot.pandas
import ipywidgets as widgets
import matplotlib.pyplot as plt
import nd2reader
import numpy as np
import pandas as pd
import param
import parambokeh
import pyarrow as pa
import pyarrow.feather as feather
import pyarrow.parquet as pq
import qgrid
import scipy
import skimage.morphology
import streamz
import streamz.dataframe as sdf
import zarr
from bokeh.models.tools import HoverTool, TapTool
from cytoolz import *
from dask import delayed
from dask_jobqueue import SLURMCluster
from distributed import Client, LocalCluster, progress
from holoviews.operation.datashader import regrid
from holoviews.streams import Selection1D, Stream, param
from tqdm import tnrange, tqdm, tqdm_notebook
from traitlets import All

IDX = pd.IndexSlice

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# from processing import *
# from trench_detection import *
# from trench_segmentation import *
# from trench_segmentation.watershed import *
# from util import *
# from ui import *
import common
import data_io
import diagnostics
import geometry
import image
import metadata
import trench_detection
import trench_detection.core
import trench_detection.hough
import trench_segmentation.watershed
import ui
import util
import workflow

In [None]:
# Notebook-wide setup: plotting backends plus verbose asyncio/logging output.
# %load_ext line_profiler
hv.extension("bokeh")  # render HoloViews output with the bokeh backend
%matplotlib inline
# presumably turns off tqdm's background monitor thread — confirm against tqdm docs
tqdm.monitor_interval = 0
asyncio.get_event_loop().set_debug(True)  # asyncio debug mode on the current event loop
import logging

# DEBUG on the root logger: every library that logs becomes verbose.
logging.basicConfig(level=logging.DEBUG)

# Analysis

In [None]:
# Column subset used for the timing experiments below; it is passed as
# `columns=` to data_io.read_parquet. The same list is re-assigned verbatim in
# a later cell before read_arrow is called.
# cols = ['position', 'label', 'filename', 't', 'trench', 'trench_set', 'channel']
# cols = ["('YFP', 'labelwise', 'p0.9')"]
cols = [
    "filename",
    "position",
    "channel",
    "t",
    "trench_set",
    "trench",
    "label",
    # Measurement columns are named by the string form of a tuple.
    "('YFP', 'labelwise', 'p0.9')",
    "('MCHERRY', 'labelwise', 'p0.9')",
    "('YFP', 'regionprops', 'area')",
]

## Sorting Arrow

In [None]:
import io

In [None]:
# NOTE(review): SyntaxError as written — the first (raw stream) argument is
# missing. A commented-out line further down wraps pa.OSFile(parquet_filename)
# the same way; confirm the intended stream before running this cell.
io.BufferedReader(, io.DEFAULT_BUFFER_SIZE*1000)

In [None]:
# IPython help lookup for ParquetWriter.write_table (no effect on state).
?pq.ParquetWriter.write_table

In [None]:
%%time
# Convert the Arrow stream file into a Parquet file via the project helper
# (presumably sorting rows on the way, per its name — confirm in data_io).
data_io.sort_arrow_to_parquet(
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.arrow",
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.sorted3.parquet4",
    length=None,
)

In [None]:
%%time
# Load only the columns listed in `cols` from the Parquet file written above.
t = data_io.read_parquet(
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.sorted3.parquet4",
    columns=cols,
)  # .to_pandas()

In [None]:
%%time
# Time the Arrow -> pandas conversion with threading enabled.
tp3 = t.to_pandas(use_threads=True)

In [None]:
# Check whether the index of the converted frame is lexsorted.
tp3.index.is_lexsorted()

In [None]:
%%time
# Same conversion after dropping the schema metadata, for comparison
# (presumably so the stored pandas index is not reconstructed — confirm).
tp2 = t.replace_schema_metadata().to_pandas()

In [None]:
%%time
# Time building the index by hand on the metadata-free frame.
tp22 = tp2.set_index(["filename", "position", "t", "trench_set", "trench", "label"])

In [None]:
# Memory footprint, including the contents of object-dtype columns.
tp2.info(memory_usage="deep")

In [None]:
# Open the raw Arrow stream file to inspect a single record batch.
# NOTE(review): `f` is not closed in the visible cells and is rebound to a
# ParquetFile further down.
f = pa.OSFile("/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.arrow")

In [None]:
s = pa.RecordBatchStreamReader(f)

In [None]:
# First record batch of the stream.
b = s.read_next_batch()

In [None]:
import json

In [None]:
# The b"pandas" schema metadata entry is JSON; pull out its "index_columns" list.
index_columns = json.loads(b.schema.metadata[b"pandas"])["index_columns"]

In [None]:
index_columns

In [None]:
# Project helper applied to the batch (semantics defined in data_io).
data_io.first_index(b)

In [None]:
# First value of the "filename" column, looked up by field index.
b.column(b.schema.get_field_index("filename"))[0]

## Parquet

In [None]:
# Open the (unsorted) Parquet file for metadata / row-group inspection.
f = pq.ParquetFile(
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.parquet4"
)

In [None]:
# Read only the first row group.
t = f.read_row_group(0)

In [None]:
# Is "filename" stored as a plain string column?
t.schema.field_by_name("filename").type == pa.string()

In [None]:
# Total row count recorded in the file metadata.
f.metadata.num_rows

In [None]:
%%time
# Time reading the `cols` subset through the project reader.
parquet_filename = (
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.parquet4"
)
# parquet_filename = '/tmp/analysis_full_stream11_2.parquet4'
# parquet_filename = io.BufferedReader(pa.OSFile(parquet_filename), buffer_size=io.DEFAULT_BUFFER_SIZE * 1000)
t = data_io.read_parquet(parquet_filename, columns=cols)

In [None]:
%%time
# Time the conversion to pandas (result is discarded).
t.to_pandas()

In [None]:
%%time
# Copy the whole Parquet file into an in-memory output stream so that later
# reads can be timed without disk I/O.
imos = pa.BufferOutputStream()
in_file = pa.OSFile(
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.parquet4"
)
imos.upload(in_file)
# tt = pq.read_pandas(imos.getvalue())#.to_pandas()
# tt = pq.read_pandas(imos.getvalue())#.to_pandas()

In [None]:
%%time
# Read the `cols` subset from the in-memory copy of the Parquet file.
# This has to go through the project's data_io module: `io` in this notebook is
# the standard-library module, which has no read_parquet.
# NOTE(review): confirm data_io.read_parquet accepts `categories` and `length`.
t4 = data_io.read_parquet(
    imos.getvalue(), columns=cols, categories=["filename"], length=None
)

In [None]:
%%time
# NOTE(review): `t3` is not assigned in any visible cell (the preceding cell
# binds `t4`); these cells depend on earlier kernel state — confirm which
# table is meant.
t3.to_pandas().info()

In [None]:
%%time
# Same conversion with the schema metadata dropped, for comparison.
t3.replace_schema_metadata().to_pandas().info()

In [None]:
%prun t3.to_pandas()

In [None]:
%prun t3.replace_schema_metadata().to_pandas()

In [None]:
# Metadata-free frame used to profile set_index separately.
k = t3.replace_schema_metadata().to_pandas()

In [None]:
%prun k.set_index('filename')

In [None]:
%%time
tp = t3.to_pandas()

In [None]:
%%time
# Repeat the in-memory read with the ".nofilename" variant of the Parquet file.
imos = pa.BufferOutputStream()
in_file = pa.OSFile(
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.nofilename.parquet4"
)
imos.upload(in_file)
tt = pq.read_pandas(imos.getvalue())  # .to_pandas()

In [None]:
# Drop the schema metadata before converting.
tt2 = tt.replace_schema_metadata()

In [None]:
%%time
# Rebinds `tp2`, which an earlier cell also assigns.
tp2 = tt2.to_pandas()

In [None]:
%%time
# Memory footprint of the frame built from `t3`, including object contents.
tp.info(memory_usage="deep")

In [None]:
# Scratch setup for re-writing an Arrow stream into memory.
# NOTE(review): `arrow_filename` is assigned in a later cell; run that first.
reader = pa.open_stream(arrow_filename)
table0 = pa.Table.from_batches([reader.read_next_batch()])
imos = pa.BufferOutputStream()
# The writer takes the output sink and a schema (not a finalized buffer and a
# table). imos.getvalue() finalizes the stream, so it must only be called
# after all writes are done and the writer has been closed.
writer = pa.RecordBatchStreamWriter(imos, table0.schema)

In [None]:
def read_arrow(arrow_filename, columns, categorical_columns=None, batch_size=1000):
    """Load selected columns of an Arrow stream file into an in-memory Arrow stream.

    Record batches are read from ``arrow_filename``, combined ``batch_size`` at
    a time with ``util.grouper``, stripped of every column not listed in
    ``columns``, dictionary-encoded where requested, and re-written to an
    in-memory buffer. Timing for each group is printed as a progress indicator.

    Args:
        arrow_filename: Source accepted by ``pa.open_stream``.
        columns: Names of the columns to keep.
        categorical_columns: Names of columns to dictionary-encode. ``None``
            (the default) encodes only ``"filename"``, which is what this
            function previously hard-coded. Names that are not present after
            column selection are ignored.
        batch_size: Number of record batches combined into each written table.

    Returns:
        A stream reader (from ``pa.open_stream``) over the in-memory result;
        call ``read_all()`` on it to obtain a table.
    """
    import itertools

    if categorical_columns is None:
        categorical_columns = ["filename"]

    reader = pa.open_stream(arrow_filename)
    # The first batch is needed to learn the column names, but it is still
    # data: chain it back in front of the reader so its rows are not dropped.
    first_batch = reader.read_next_batch()
    columns_to_drop = list(set(first_batch.schema.names) - set(columns))
    all_batches = itertools.chain([first_batch], reader)

    imos = pa.BufferOutputStream()
    writer = None
    t0 = time.time()
    for group_idx, batches in enumerate(util.grouper(all_batches, batch_size)):
        t = time.time()
        dt = t - t0
        t0 = t
        print("batch", group_idx, "time {:.2f}".format(dt))
        table = pa.Table.from_batches(batches).drop(columns_to_drop)
        for name in categorical_columns:
            col_idx = table.schema.get_field_index(name)
            if col_idx >= 0:
                table = table.set_column(
                    col_idx, table.column(col_idx).dictionary_encode()
                )
        if dt > 0:  # the clock may not have advanced between two fast groups
            print("    rows per second", len(table) / dt)
        if writer is None:
            # Build the writer from the first processed table so its schema
            # matches what is actually written (dictionary-encoding changes
            # the column type).
            # NOTE(review): later groups may carry different dictionaries;
            # confirm the installed pyarrow accepts that in the stream format.
            writer = pa.RecordBatchStreamWriter(imos, table.schema)
        writer.write_table(table)
    if writer is not None:
        # Close before reading back so the stream is properly terminated.
        writer.close()
    return pa.open_stream(imos.getvalue())

In [None]:
# Input for read_arrow; the commented-out alternatives are subset files on scratch.
# arrow_filename = '/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.subset1000.arrow'
# arrow_filename = '/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.subset15000.arrow'
# arrow_filename = '/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.subset15000.arrow'
arrow_filename = "/tmp/analysis_full_stream11_2.arrow"

In [None]:
%%time
# Re-declare the column subset (same list as at the top of the Analysis
# section) and time the in-memory re-write of the Arrow stream.
# cols = ['position', 'label', 'filename', 't', 'trench', 'trench_set', 'channel']
# cols = ["('YFP', 'labelwise', 'p0.9')"]
cols = [
    "filename",
    "position",
    "channel",
    "t",
    "trench_set",
    "trench",
    "label",
    "('YFP', 'labelwise', 'p0.9')",
    "('MCHERRY', 'labelwise', 'p0.9')",
    "('YFP', 'regionprops', 'area')",
]
# `b` is the stream reader returned by read_arrow (rebinds the earlier batch `b`).
b = read_arrow(arrow_filename, cols)

In [None]:
%time table = b.read_all()

In [None]:
len(table)

In [None]:
# Dictionary-encode the column at position 3.
# NOTE(review): which column that is depends on the table's column order — confirm.
table2 = table.set_column(3, table.column(3).dictionary_encode())