# Imports

In [None]:
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.feather as feather
import zarr
import dask
from dask import delayed
import distributed
from distributed import Client, LocalCluster, progress
from dask_jobqueue import SLURMCluster
import streamz
import streamz.dataframe as sdf
import holoviews as hv
from holoviews.streams import Stream, param, Selection1D
from holoviews.operation.datashader import regrid
from bokeh.models.tools import HoverTool, TapTool
import matplotlib.pyplot as plt
import qgrid
import ipywidgets as widgets
from tqdm import tnrange, tqdm, tqdm_notebook
import warnings
from functools import partial
from cytoolz import *
from operator import getitem
import nd2reader
from importlib import reload
import traceback
import hvplot.pandas
import param
import parambokeh
from traitlets import All
import cachetools
from collections import namedtuple, defaultdict
from collections.abc import Mapping, Sequence
from numbers import Number
import skimage.morphology
import scipy
from glob import glob
import asyncio
import time

IDX = pd.IndexSlice

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# from processing import *
# from trench_detection import *
# from trench_segmentation import *
# from trench_segmentation.watershed import *
# from util import *
# from ui import *
import common, trench_detection, util
import ui, diagnostics, metadata
import workflow, image, geometry
import trench_detection.hough, trench_detection.core
import trench_segmentation.watershed

In [None]:
#%load_ext line_profiler
hv.extension("bokeh")
%matplotlib inline
tqdm.monitor_interval = 0
asyncio.get_event_loop().set_debug(True)
import logging

logging.basicConfig(level=logging.DEBUG)

# Restore data

In [None]:
%store -r trench_points
%store -r trench_diag
%store -r trench_bboxes
trench_bboxes_t0 = util.get_one(trench_bboxes.groupby("t"))[1]

# Analysis

## Arrow copy with dictionary

In [None]:
def copy_arrow2(
    in_filename, out_filename, length=None, batch_size=10000, process_func=None
):
    """Copy an Arrow record-batch stream to a new file, optionally transforming it.

    The first batch is read on its own so the output writer can be created
    with the schema of the (processed) first table. The remaining batches are
    regrouped ``batch_size`` at a time with ``util.grouper`` and written as
    tables. Timing and throughput are printed for every group.

    Parameters
    ----------
    in_filename : path of the Arrow stream to read (opened memory-mapped).
    out_filename : path of the Arrow stream to write.
    length : if not None, at most this many batches are copied *after* the
        first one, i.e. up to ``length + 1`` batches in total.
    batch_size : number of input batches combined into each written table.
    process_func : optional callable ``table -> table`` applied to every table
        (including the first) before it is written.
    """
    with pa.memory_map(in_filename) as in_file, pa.OSFile(
        out_filename, "wb"
    ) as out_file:
        reader = pa.RecordBatchStreamReader(in_file)
        table0 = pa.Table.from_batches([reader.read_next_batch()])
        if process_func is not None:
            table0 = process_func(table0)
        writer = pa.RecordBatchStreamWriter(out_file, table0.schema)
        try:
            writer.write_table(table0)
            if length is not None:
                reader = take(length, reader)
            t0 = time.time()
            for i, batches in enumerate(util.grouper(reader, batch_size)):
                t = time.time()
                dt = t - t0
                t0 = t
                print("batch", i, "time {:.2f}".format(dt))
                table = pa.Table.from_batches(batches)
                if process_func is not None:
                    table = process_func(table)
                print("    rows per second", len(table) / dt)
                writer.write_table(table)
        finally:
            # Closing the writer finishes the stream; without it the output
            # is left unterminated and possibly unflushed.
            writer.close()


def fix_filename(table):
    """Return ``table`` with every column named "filename" dictionary-encoded.

    Columns are visited by position; each match is replaced through
    ``set_column`` so the input table object itself is not modified. When no
    column matches, the very same table object is returned.
    """
    for position in range(table.num_columns):
        column = table.column(position)
        if column.name != "filename":
            continue
        table = table.set_column(position, column.dictionary_encode())
    return table

In [None]:
%%time
copy_arrow2(
    "/tmp/analysis_full_stream11_2.arrow",
    "/tmp/analysis_full_stream11_2.cat.full.arrow",
    process_func=fix_filename,
)

In [None]:
%%time
t = pa.open_stream("/tmp/analysis_full_stream11_2.cat.arrow").read_all()

In [None]:
len(t)

In [None]:
t.schema.metadata[b"pandas"]

In [None]:
t2 = t.replace_schema_metadata()

In [None]:
%%time
# tp = t2.to_pandas(zero_copy_only=True, categories=['filename'], use_threads=True)
tp = t2.to_pandas(zero_copy_only=True, use_threads=True)

## Parquet with dictionary

In [None]:
def read_parquet(source, columns=None, categories=None, length=None):
    """Read a Parquet file row group by row group into one Arrow table.

    Parameters
    ----------
    source : anything accepted by ``pq.ParquetFile`` (path or buffer).
    columns : column names to read, or None for all columns.
    categories : names of columns to dictionary-encode. Each row group is
        encoded separately, then the per-group dictionaries are unified with
        ``harmonize_dictionaries`` so the tables can be concatenated.
    length : number of leading row groups to read; None (or 0) reads them
        all. Values larger than the file's row-group count are capped.

    Returns
    -------
    The concatenation (``pa.concat_tables``) of the row groups read.
    """
    reader = pq.ParquetFile(source)
    total_row_groups = reader.num_row_groups
    # `length or ...` keeps the historical meaning of None/0 ("everything");
    # the cap avoids requesting row groups that do not exist.
    num_to_read = min(length or total_row_groups, total_row_groups)
    tables = []
    category_idxs = None
    for i in range(num_to_read):
        start = time.time()
        table = reader.read_row_group(i, nthreads=4, columns=columns)
        # TODO
        # table = table.replace_schema_metadata()
        if categories is not None:
            if category_idxs is None:
                # Column positions are resolved once, from the first table.
                category_idxs = [
                    table.schema.get_field_index(column) for column in categories
                ]
            for idx in category_idxs:
                table = table.set_column(idx, table.column(idx).dictionary_encode())
        # Time the work done for *this* row group, so the reported rate
        # matches the rows it is divided into.
        dt = time.time() - start
        print("batch {}/{}".format(i, total_row_groups), "time {:.2f}".format(dt))
        print("    rows per second", len(table) / dt if dt > 0 else float("inf"))
        tables.append(table)
    if category_idxs is not None:
        for idx in category_idxs:
            category_columns = [table.column(idx) for table in tables]
            new_category_columns = harmonize_dictionaries(category_columns)
            for i, new_column in enumerate(new_category_columns):
                tables[i] = tables[i].set_column(idx, new_column)
    return pa.concat_tables(tables)

In [None]:
# cols = ['position', 'label', 'filename', 't', 'trench', 'trench_set', 'channel']
# cols = ["('YFP', 'labelwise', 'p0.9')"]
cols = [
    "filename",
    "position",
    "channel",
    "t",
    "trench_set",
    "trench",
    "label",
    "('YFP', 'labelwise', 'p0.9')",
    "('MCHERRY', 'labelwise', 'p0.9')",
    "('YFP', 'regionprops', 'area')",
]

In [None]:
import numpy_indexed as npi

In [None]:
%%time
imos = pa.BufferOutputStream()
in_file = pa.OSFile(
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.parquet4"
)
imos.upload(in_file)
# tt = pq.read_pandas(imos.getvalue())#.to_pandas()

In [None]:
%%time
t4 = read_parquet(imos.getvalue(), columns=cols, categories=["filename"], length=None)

In [None]:
%%time
# parquet_filename = '/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.parquet4'
parquet_filename = (
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.parquet4"
)
# parquet_filename = '/tmp/analysis_full_stream11_2.parquet4'
t3 = read_parquet(parquet_filename, columns=cols, categories=["filename"], length=None)

In [None]:
%%time
t3.to_pandas().info()

In [None]:
%%time
t3.replace_schema_metadata().to_pandas().info()

In [None]:
%prun t3.to_pandas()

In [None]:
%prun t3.replace_schema_metadata().to_pandas()

In [None]:
k = t3.replace_schema_metadata().to_pandas()

In [None]:
%prun k.set_index('filename')

In [None]:
%%time
tp = t3.to_pandas()

In [None]:
pd.Categorical.from_codes?

In [None]:
tc3[0]

In [None]:
a = tc3[0].column(0)

In [None]:
b = a.data.chunk(0)

In [None]:
b.dictionary

In [None]:
pa.DictionaryArray?

In [None]:
pd.concat([c.to_pandas() for c in cols])

In [None]:
pd.concat([c.to_pandas() for c in cols])

In [None]:
cols = [tbl.column(0) for tbl in tc3]
dicts = [c.data.chunk(0).dictionary for c in cols]

In [None]:
dicts

In [None]:
set([0]) + set([1])

In [None]:
master_dictionary = list(set.union(*(set(d) for d in dicts)))

In [None]:
value_to_index = dict(map(reversed, enumerate(master_dictionary)))
value_to_index

In [None]:
dicts[3]

In [None]:
ch = cols[0].data.chunk(0)

In [None]:
pa.DictionaryArray.from_arrays(ch.indices, pa.array(["x"]))

In [None]:
ch.to_pandas()

In [None]:
pa.ChunkedArray?

In [None]:
ch.dictionary = ["hoo"]

In [None]:
ch.indices.to_numpy()

In [None]:
rewrite_rule = {orig: value_to_index[value] for orig, value in enumerate(dicts[3])}

In [None]:
z = cols[0]

In [None]:
pa.array(["/n/scratch2/jqs1/fidelity/all/180405_txnerr001.nd2", "z"])

In [None]:
master_dictionary[0]

In [None]:
pa.array(["x", master_dictionary[0]])

In [None]:
q = np.arange(10)

In [None]:
import numpy_indexed as npi

In [None]:
npi.remap(q, [1, 2], [11, 22])

In [None]:
def harmonize_dictionaries(columns):
    """Re-encode dictionary columns so that they all share one dictionary.

    Parameters
    ----------
    columns : sequence of dictionary-encoded columns (old-style ``pa.Column``
        objects exposing ``.name`` and ``.data`` as a chunked array).

    Returns
    -------
    A list of new ``pa.Column`` objects, one per input column and with the
    same names, whose chunks all reference a single master dictionary. The
    input columns are left untouched.

    Notes
    -----
    * The master dictionary lists values in order of first appearance, so the
      result is the same from run to run.
    * Every chunk is remapped with its *own* dictionary.
    * New indices are always int32.
    * NOTE(review): null index entries are not treated specially; this
      assumes the inputs contain no nulls -- confirm against the data.
    """
    # Pass 1: collect the distinct dictionary values of every chunk.
    value_to_index = {}
    master_values = []
    dictionary_type = None
    for column in columns:
        for chunk_idx in range(column.data.num_chunks):
            dictionary = column.data.chunk(chunk_idx).dictionary
            if dictionary_type is None:
                dictionary_type = dictionary.type
            for value in dictionary.to_pylist():
                if value not in value_to_index:
                    value_to_index[value] = len(master_values)
                    master_values.append(value)
    master_dictionary = pa.array(master_values, type=dictionary_type)

    # Pass 2: translate each chunk's indices into master-dictionary indices.
    new_columns = []
    for column in columns:
        new_chunks = []
        for chunk_idx in range(column.data.num_chunks):
            chunk = column.data.chunk(chunk_idx)
            # Lookup table: position in this chunk's dictionary -> position
            # in the master dictionary.
            old_to_new = np.array(
                [value_to_index[value] for value in chunk.dictionary.to_pylist()],
                dtype=np.int32,
            )
            # Fancy indexing allocates a fresh array, so the Arrow buffer
            # behind `chunk.indices` is never written to.
            new_indices = old_to_new[chunk.indices.to_numpy()]
            new_chunks.append(
                pa.DictionaryArray.from_arrays(new_indices, master_dictionary)
            )
        chunked_ary = pa.chunked_array(new_chunks)
        new_columns.append(pa.Column.from_array(column.name, chunked_ary))
    return new_columns

In [None]:
z = harmonize_dictionaries(cols)

In [None]:
tz = pa.concat_tables([pa.Table.from_arrays([c]) for c in z])

In [None]:
%%time
tzp = tz.to_pandas()

In [None]:
tp

In [None]:
%%time
pq.write_table(
    pa.Table.from_pandas(tp),
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.nofilename.parquet4",
)

In [None]:
%%time
tt = pq.read_pandas(
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.nofilename.parquet4"
).to_pandas()

In [None]:
%%time
imos = pa.BufferOutputStream()
in_file = pa.OSFile(
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.nofilename.parquet4"
)
imos.upload(in_file)
tt = pq.read_pandas(imos.getvalue())  # .to_pandas()

In [None]:
tt2 = tt.replace_schema_metadata()

In [None]:
%%time
tp2 = tt2.to_pandas()

In [None]:
%%time
tp.info(memory_usage="deep")

## Parquet redux

In [None]:
%%time
pq.write_to_dataset(
    rg0, root_path="/tmp/parq_test", partition_cols=["filename", "position"]
)

In [None]:
def read_parquet(filename):
    """Load every row group of a Parquet file and concatenate them.

    Note: this cell rebinds ``read_parquet``, replacing the four-argument
    version defined earlier in the notebook.
    """
    parquet_file = pq.ParquetFile(filename)
    row_groups = []
    for group_idx in range(parquet_file.num_row_groups):
        row_groups.append(parquet_file.read_row_group(group_idx))
    return pa.concat_tables(row_groups)

In [None]:
reader = pq.ParquetFile("/tmp/analysis_full_stream11_2.parquet4")

In [None]:
rg0 = reader.read_row_group(1)

In [None]:
idx = rg0.schema.get_field_index("filename")
rg1 = rg0.set_column(idx, rg0.column(idx).dictionary_encode())

In [None]:
len(rg1)

In [None]:
%%time
tp = rg0.to_pandas()

In [None]:
%%time
t = read_parquet("/tmp/analysis_full_stream11_2.parquet4")

In [None]:
tp

In [None]:
pq.write_table(rg1, "/tmp/analysis_full_stream11_2.3cols.rg100000.rg1.parquet")

In [None]:
rg1roundtrip = pq.read_table(
    "/tmp/analysis_full_stream11_2.3cols.rg100000.rg1.parquet",
    use_pandas_metadata=False,
)

In [None]:
rg1roundtrip

In [None]:
rg1

In [None]:
%%time
t = read_parquet("/tmp/analysis_full_stream11_2.3cols.rg10000.parquet")

In [None]:
t2 = t.drop(["position", "t", "trench_set", "trench", "label"])

In [None]:
%%time
tp = t.to_pandas(zero_copy_only=True, use_threads=True, categories=["filename"])

In [None]:
%%time
pq.write_table(t3, "/tmp/analysis_full_stream11_2.3cols.scienceonly.nomd.parquet")

In [None]:
tp

## Arrow

In [None]:
arrow_filename = (
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.subset1000.arrow"
)

In [None]:
columns = ["position", "label"]

In [None]:
columns_to_drop = list(set(c.name for c in table0.columns) - set(columns))

In [None]:
columns_to_drop

In [None]:
table1 = table0.drop(columns_to_drop)

In [None]:
reader = pa.open_stream(arrow_filename)
table0 = pa.Table.from_batches([reader.read_next_batch()])
imos = pa.BufferOutputStream()
buffer = imos.getvalue()
writer = pa.RecordBatchStreamWriter(buffer, table0)

In [None]:
def read_arrow(arrow_filename, columns, categorical_columns=None, batch_size=1000):
    """Load selected columns of an Arrow stream into an in-memory Arrow stream.

    Batches are read from ``arrow_filename``, reduced to ``columns``, have
    their categorical columns dictionary-encoded, and are re-written into a
    ``pa.BufferOutputStream``. Timing and throughput are printed per group.

    Parameters
    ----------
    arrow_filename : source accepted by ``pa.open_stream``.
    columns : names of the columns to keep; all others are dropped.
    categorical_columns : names of columns to dictionary-encode. None keeps
        the historical behaviour of encoding only ``"filename"``. Names that
        are not present after dropping are skipped.
    batch_size : number of input batches combined into each written table.

    Returns
    -------
    A stream reader (``pa.open_stream``) over the in-memory copy.

    NOTE(review): each written table is dictionary-encoded independently, so
    dictionaries may differ between tables; confirm the installed pyarrow
    stream format accepts that.
    """
    if categorical_columns is None:
        categorical_columns = ["filename"]

    reader = pa.open_stream(arrow_filename)
    table0 = pa.Table.from_batches([reader.read_next_batch()])
    columns_to_drop = list(set(c.name for c in table0.columns) - set(columns))

    def _prepare(table):
        # Apply the same projection and encoding to every table so that all
        # of them match the schema the writer was created with.
        table = table.drop(columns_to_drop)
        for name in categorical_columns:
            idx = table.schema.get_field_index(name)
            if idx >= 0:
                table = table.set_column(idx, table.column(idx).dictionary_encode())
        return table

    table1 = _prepare(table0)
    imos = pa.BufferOutputStream()
    writer = pa.RecordBatchStreamWriter(imos, table1.schema)
    try:
        # The first batch was consumed to learn the schema; write it too so
        # its rows are not lost.
        writer.write_table(table1)
        t0 = time.time()
        for i, batches in enumerate(util.grouper(reader, batch_size)):
            t = time.time()
            dt = t - t0
            t0 = t
            print("batch", i, "time {:.2f}".format(dt))
            table = _prepare(pa.Table.from_batches(batches))
            print("    rows per second", len(table) / dt)
            writer.write_table(table)
    finally:
        # Finish the stream before the buffer is handed to a reader.
        writer.close()
    return pa.open_stream(imos.getvalue())

In [None]:
# arrow_filename = '/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.subset1000.arrow'
# arrow_filename = '/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.subset15000.arrow'
# arrow_filename = '/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.subset15000.arrow'
arrow_filename = "/tmp/analysis_full_stream11_2.arrow"

In [None]:
%%time
# cols = ['position', 'label', 'filename', 't', 'trench', 'trench_set', 'channel']
# cols = ["('YFP', 'labelwise', 'p0.9')"]
cols = [
    "filename",
    "position",
    "channel",
    "t",
    "trench_set",
    "trench",
    "label",
    "('YFP', 'labelwise', 'p0.9')",
    "('MCHERRY', 'labelwise', 'p0.9')",
    "('YFP', 'regionprops', 'area')",
]
b = read_arrow(arrow_filename, cols)

In [None]:
%time table = b.read_all()

In [None]:
len(table)

In [None]:
table2 = table.set_column(3, table.column(3).dictionary_encode())

In [None]:
arrow_file = pa.OSFile("/tmp/analysis_full_stream11_2.3cols.dict3.arrow", "wb")
writer = pa.RecordBatchStreamWriter(arrow_file, table3.schema)

In [None]:
%%time
writer.write_table(table3)

In [None]:
table3 = table2.drop(["filename"])

In [None]:
%%time
tp = table.to_pandas(zero_copy_only=True, strings_to_categorical=True)

In [None]:
%%time
tp.info()  # memory_usage='deep')

In [None]:
%%time
pq.write_table(
    table, "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.3cols.parquet"
)

In [None]:
%%time
pq.write_table(
    table,
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.3cols.rg1000.parquet",
    row_group_size=1000,
)

In [None]:
%%time
pq.write_table(
    table,
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.3cols.rg1000000.parquet",
    row_group_size=1000000,
)

In [None]:
%%time
pq.write_table(table3, "/tmp/analysis_full_stream11_2.3cols.nostr.parquet")

In [None]:
%%time
pq.write_table(
    table,
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.3cols.rg100000.parquet",
    row_group_size=100000,
)

In [None]:
%%time
pq.write_table(
    pa.Table.from_pandas(tp),
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.3cols.roundtrip.parquet",
)

In [None]:
pq.write_table?

In [None]:
%%time
pt = pq.read_table("/tmp/analysis_full_stream11_2.3cols.nostr.parquet")

In [None]:
import fastparquet

In [None]:
%%time
cols = [
    "('MCHERRY', 'labelwise', 'p0.9')",
    "('YFP', 'labelwise', 'p0.9')",
    "('YFP', 'regionprops', 'area')",
    "position",
    "t",
    "trench_set",
    "trench",
    "label",
]
ptf = fastparquet.ParquetFile(
    "/tmp/analysis_full_stream11_2.3cols.nostr.parquet"
).to_pandas(columns=None, index=False)

In [None]:
ptf.to_pandas?

In [None]:
ptf

In [None]:
ptf.

In [None]:
%%time
pt.to_pandas()

In [None]:
_.info(memory_usage="deep")

In [None]:
%%time
tp = pq.read_pandas(
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.3cols.rg100000.parquet"
)  # .to_pandas()

In [None]:
# parquet_filename = '/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.3cols.rg100000.parquet'
parquet_filename = "/tmp/analysis_full_stream11_2.3cols.rg100000.parquet"
f = pq.ParquetFile(parquet_filename)

In [None]:
f.num_row_groups

In [None]:
len(f.read_row_group(0))

In [None]:
t = f.read_row_group(0)

In [None]:
t

In [None]:
%%time
t0 = time.time()
for i in range(f.num_row_groups):
    row_group = f.read_row_group(i)
    if i % 100 == 0:
        t = time.time()
        dt = t - t0
        t0 = t
        print("batch", i, "time {:.2f}".format(dt))

In [None]:
%%time
tpq = pa.concat_tables([f.read_row_group(i) for i in range(f.num_row_groups)])

In [None]:
c = tpq.column(3)

In [None]:
tpq2 = tpq.set_column(3, tpq.column(3).dictionary_encode())

In [None]:
%%time
pq.write_table(
    tpq2,
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.3cols.rg10000.dict.parquet",
    row_group_size=10000,
)

In [None]:
pa.write

In [None]:
%%time
c.dictionary_encode()

In [None]:
%%time
tpqs = tpq.to_pandas(strings_to_categorical=True)

In [None]:
tpqs

In [None]:
x = pa.array(list("1" * 2**30))

demo = "demo.parquet"


def scenario():
    """Reproduce a pyarrow read failure on a Parquet file with a huge string column.

    Writes the module-level array ``x`` (2**30 one-character strings) twice,
    as two ``write_table`` calls, to the file named by the module-level
    ``demo``, then reads the whole file back. Returns nothing; the function
    exists only for the error recorded in the comment below.
    """
    t = pa.Table.from_arrays([x], ["x"])
    writer = pq.ParquetWriter(demo, t.schema)
    # Same table written twice, doubling the amount of string data in the file.
    for i in range(2):
        writer.write_table(t)
    writer.close()

    pf = pq.ParquetFile(demo)

    # pyarrow.lib.ArrowIOError: Arrow error: Invalid: BinaryArray cannot contain more than 2147483646 bytes, have 2147483647
    t2 = pf.read()

In [None]:
scenario()

In [None]:
%%time
f.read().to_pandas()

In [None]:
f.read_row_group(10000).to_pandas()

In [None]:
%%time
fastparquet.write(
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.3cols.fastparquet",
    tp,
)

In [None]:
tp

## Parquetify

In [None]:
def parquetify(arrow_filename, parquet_filename=None, batch_size=1000):
    """Convert an Arrow record-batch stream file into a Parquet file.

    Parameters
    ----------
    arrow_filename : path of the Arrow stream to read.
    parquet_filename : output path. When None, it is derived from
        ``arrow_filename`` by swapping a trailing ``.arrow`` for ``.parquet``
        (or appending ``.parquet`` if there is no such suffix).
    batch_size : number of input batches combined into each table passed to
        ``ParquetWriter.write_table``.
    """
    if parquet_filename is None:
        # Only touch the trailing extension: a plain str.replace would also
        # rewrite ".arrow" inside directory names and, when the suffix is
        # absent, would leave the output path equal to the input path.
        suffix = ".arrow"
        if arrow_filename.endswith(suffix):
            base = arrow_filename[: -len(suffix)]
        else:
            base = arrow_filename
        parquet_filename = base + ".parquet"
    reader = pa.open_stream(arrow_filename)
    # The first batch is read separately to obtain the schema for the writer.
    table0 = pa.Table.from_batches([reader.read_next_batch()])
    with pq.ParquetWriter(parquet_filename, table0.schema) as writer:
        writer.write_table(table0)
        for batches in util.grouper(reader, batch_size):
            writer.write_table(pa.Table.from_batches(batches))

In [None]:
%%time
parquetify(
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_0.arrow",
    parquet_filename="/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_0.test.parquet",
)

In [None]:
z = pq.ParquetFile(
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_1.parquet"
)

In [None]:
z.num_row_groups

In [None]:
%%time
pq.read_table(
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_0.test.parquet"
).to_pandas()

In [None]:
%%time
parquetify("/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_0.arrow")

In [None]:
%%time
parquetify("/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_1.arrow")

In [None]:
%%timeit
open(
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_1.parquet", "rb"
).read()

In [None]:
%%time
pq.read_table(
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_1.parquet", nthreads=4
).to_pandas()

## Read Parquet from memory

In [None]:
buf = pa.BufferOutputStream()

In [None]:
in_file = pa.OSFile(
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_1.arrow"
)

In [None]:
%%time
buf.upload(in_file, buffer_size=2**20)

In [None]:
bufr = pa.BufferReader(buf.getvalue())

In [None]:
bufr.seek(0)

In [None]:
%%time
t = pa.open_stream(bufr).read_all()

In [None]:
in_file.seek(0)

In [None]:
pa.MemoryMappedFile?

In [None]:
in_file2 = pa.MemoryMappedFile(
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_1.arrow"
)

In [None]:
in_file2 = pa.memory_map(
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_1.arrow"
)

In [None]:
in_file2.seek(0)

In [None]:
in_file2.read(100)

In [None]:
%%time
tt = pa.open_stream(in_file2).read_all()

In [None]:
t = pq.read_table(
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_1.parquet", nthreads=1
)

In [None]:
%%time
pq.write_table(
    t,
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_1.test.parquet",
    compression="zstd",
    row_group_size=10000,
    version="2.0",
)

In [None]:
%%time
pq.read_table(
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_1.test.parquet",
    nthreads=4,
)

In [None]:
%prun pq.read_table('/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_1.parquet', nthreads=1)

In [None]:
%prun pq.ParquetFile(bufr).read()

In [None]:
%%time
pq.ParquetFile(bufr).read()

## Parquetify2

In [None]:
def copy_arrow(in_filename, out_filename, length=None, batch_size=1000):
    """Copy (and re-batch) an Arrow record-batch stream into a new file.

    The first batch is read on its own to obtain the schema and is written
    first; the remaining batches are regrouped ``batch_size`` at a time with
    ``util.grouper`` and written as tables.

    Parameters
    ----------
    in_filename : path of the Arrow stream to read (opened memory-mapped).
    out_filename : path of the Arrow stream to write.
    length : if not None, at most this many batches are copied *after* the
        first one, i.e. up to ``length + 1`` batches in total.
    batch_size : number of input batches combined into each written table.
    """
    with pa.memory_map(in_filename) as in_file, pa.OSFile(
        out_filename, "wb"
    ) as out_file:
        reader = pa.RecordBatchStreamReader(in_file)
        table0 = pa.Table.from_batches([reader.read_next_batch()])
        writer = pa.RecordBatchStreamWriter(out_file, table0.schema)
        try:
            writer.write_table(table0)
            if length is not None:
                reader = take(length, reader)
            for batches in util.grouper(reader, batch_size):
                writer.write_table(pa.Table.from_batches(batches))
        finally:
            # Closing the writer finishes the stream; without it the output
            # is left unterminated and possibly unflushed.
            writer.close()

In [None]:
%%time
copy_arrow(
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.arrow",
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.subset.arrow",
    batch_size=1000,
    length=1000,
)

In [None]:
%%time
copy_arrow(
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.subset.arrow",
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.subset.rebatched1000.arrow",
    batch_size=1000,
    length=None,
)

In [None]:
%%time
copy_arrow(
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.subset.arrow",
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.subset.rebatched10000.arrow",
    batch_size=10000,
    length=None,
)

In [None]:
%%time
copy_arrow(
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.subset.arrow",
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.subset.rebatched100.arrow",
    batch_size=100,
    length=None,
)

In [None]:
%%time
parquetify2(
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.subset.arrow",
    batch_size=1,
    length=1000,
)

In [None]:
%%time
parquetify2(
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.subset.rebatched100.arrow",
    batch_size=1,
    length=1000,
)

In [None]:
%%time
parquetify2(
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.subset.rebatched1000.arrow",
    batch_size=1,
    length=1000,
)

In [None]:
%%time
parquetify2(
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.subset.rebatched10000.arrow",
    batch_size=1,
    length=1000,
)

In [None]:
in_file = pa.OSFile(
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.subset.arrow"
)

In [None]:
reader = pa.RecordBatchStreamReader(in_file)

In [None]:
batch = reader.read_next_batch()

In [None]:
batches = list(take(10, reader))

In [None]:
table = pa.Table.from_batches(batches)

In [None]:
len(table)

In [None]:
batches2 = table.to_batches(chunksize=10000)

In [None]:
len(batches2)

In [None]:
len(batches[0])

In [None]:
len(batches2[0])

In [None]:
col = table.column(0)

In [None]:
col.data.

In [None]:
pa.Table.from_batches([batches2[0]]).to_pandas().info(memory_usage="deep")

In [None]:
len(batches2[0])

In [None]:
len(batch)

In [None]:
def parquetify2(arrow_filename, parquet_filename=None, batch_size=1000, length=None):
    """Convert (a prefix of) an Arrow record-batch stream file into Parquet.

    Parameters
    ----------
    arrow_filename : path of the Arrow stream to read (opened with
        ``pa.OSFile``).
    parquet_filename : output path. When None, it is derived from
        ``arrow_filename`` by swapping a trailing ``.arrow`` for ``.parquet4``
        (or appending ``.parquet4`` if there is no such suffix).
    batch_size : number of input batches combined into each table passed to
        ``ParquetWriter.write_table``.
    length : if not None, at most this many batches are converted *after*
        the first one, i.e. up to ``length + 1`` batches in total.
    """
    if parquet_filename is None:
        # Only touch the trailing extension: a plain str.replace would also
        # rewrite ".arrow" inside directory names and, when the suffix is
        # absent, would leave the output path equal to the input path.
        suffix = ".arrow"
        if arrow_filename.endswith(suffix):
            base = arrow_filename[: -len(suffix)]
        else:
            base = arrow_filename
        parquet_filename = base + ".parquet4"
    # The input handle is closed on exit, including when an error occurs.
    with pa.OSFile(arrow_filename) as arrow_file:
        reader = pa.open_stream(arrow_file)
        # The first batch is read separately to obtain the writer's schema.
        table0 = pa.Table.from_batches([reader.read_next_batch()])
        if length is not None:
            reader = take(length, reader)
        with pq.ParquetWriter(parquet_filename, table0.schema) as writer:
            writer.write_table(table0)
            for batches in util.grouper(reader, batch_size):
                writer.write_table(pa.Table.from_batches(batches))

In [None]:
%%time
copy_arrow(
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.arrow",
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.subset15000.arrow",
    length=15000,
    batch_size=1,
)

In [None]:
%%time
parquetify2(
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.arrow",
    batch_size=1000,
)

In [None]:
cols = [
    "('YFP', 'labelwise', 'p0.9')",
    "('MCHERRY', 'labelwise', 'p0.9')",
    "('YFP', 'regionprops', 'mean')",
    "('YFP', 'regionprops', 'area')",
]

In [None]:
# cols = ["('YFP', 'labelwise', 'p0.9')"]
# cols = ['position']
# cols = ["('YFP', 'labelwise', 'p0.9')", 'filename', 'position']
# cols = ["('YFP', 'labelwise', 'p0.9')", 'position', 'trench', 'label']
cols = ["('YFP', 'labelwise', 'p0.9')", "position", "trench", "label", "filename"]

In [None]:
%%time
# reader = pq.ParquetFile('/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.parquet4')
reader = pq.ParquetFile(
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.onecol.rg10000.parquet4"
)
# for i in range(0,reader.num_row_groups):
#     print('row group', i)
#     reader.read_row_group(i, use_pandas_metadata=True)
tables = [
    reader.read_row_group(i, columns=None, nthreads=1, use_pandas_metadata=False)
    for i in range(reader.num_row_groups)
]
table = pa.concat_tables(tables)
# t = table.to_pandas(use_threads=True, strings_to_categorical=True)

In [None]:
import fastparquet

In [None]:
%%time
tf = fastparquet.ParquetFile(
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.parquet4"
)

In [None]:
next(tf.iter_row_groups())

In [None]:
tf.row_groups[0]

In [None]:
tf.read_row_group(0, [("YFP", "labelwise", "p0.9")], {})

In [None]:
%%time
tf = fastparquet.ParquetFile(
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.parquet4"
)  # .to_pandas()

In [None]:
tff = tf.to_pandas(columns=["position"])

In [None]:
%%time
tf = fastparquet.ParquetFile(
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.onecol.rg10000.parquet4"
).to_pandas()

In [None]:
pd.read_parquet?

In [None]:
%%time
reader = pq.ParquetFile(
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.parquet4"
)
tables = [
    reader.read_row_group(i, columns=cols, nthreads=4, use_pandas_metadata=False)
    for i in range(reader.num_row_groups)
]
table = pa.concat_tables(tables)
# t = table.to_pandas(use_threads=True, strings_to_categorical=True)

In [None]:
%%time
t = table.to_pandas(use_threads=True, strings_to_categorical=True)

In [None]:
%%time
pq.write_table(
    pa.Table.from_pandas(t),
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.onecol.rg10000.parquet4",
    row_group_size=10000,
)

In [None]:
%%time
tsel = pq.read_pandas(
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.onecol.rg10000.parquet4"
).to_pandas()

In [None]:
t.head()

In [None]:
import sys

In [None]:
len(t[("YFP", "labelwise", "p0.9")].data)

In [None]:
t.info(memory_usage="deep")

In [None]:
%%time
t2 = t.reset_index()

In [None]:
%%time
t2.info(memory_usage="deep")

In [None]:
%%time
t = pq.read_pandas(
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.parquet4",
    nthreads=0,
    columns=cols,
)  # .to_pandas()

In [None]:
%%time
tp = t.to_pandas(use_threads=True)

In [None]:
tp.info(memory_usage="deep")

In [None]:
reader = pq.ParquetFile(
    pa.OSFile("/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.parquet4")
)

In [None]:
a = reader.read_row_group(0)

In [None]:
a.schema

In [None]:
len(t)

In [None]:
%%time
tt = (
    pa.open_stream(
        pa.memory_map(
            "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.subset1000.arrow"
        )
    )
    .read_all()
    .to_pandas()
)

In [None]:
%%time
ttt = (
    pa.open_stream(
        pa.OSFile(
            "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_2.subset1000.arrow"
        )
    )
    .read_all()
    .to_pandas()
)

In [None]:
%%time
t.to_pandas()

In [None]:
%%time
tt.to_pandas()

In [None]:
tt.to_pandas?

In [None]:
%prun parquetify2('/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_0.arrow')

## Load data

In [None]:
%%time
framewise_df = pa.open_stream(
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_0.arrow"
).read_pandas()

In [None]:
framewise_df.info(memory_usage="deep")

In [None]:
s = pa.open_stream("/n/scratch2/jqs1/fidelity/all/output/analysis50_stream_0.arrow")

In [None]:
for a in take(2, s):
    print(a)

In [None]:
s.read_next_batch?

In [None]:
s.read_pandas()

In [None]:
%prun pa.open_stream('/n/scratch2/jqs1/fidelity/all/output/analysis50_stream_0.arrow').read_all()#.read_pandas()

In [None]:
%prun pa.open_stream('/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_0.arrow').read_all()#.read_pandas()

In [None]:
%%time
trenchwise_df = pa.open_stream(
    "/n/scratch2/jqs1/fidelity/all/output/analysis_full_stream11_1.arrow"
).read_pandas()

In [None]:
trenchwise_df.info(memory_usage="deep")

In [None]:
%%time
labelwise_df = pq.read_pandas(
    "/n/scratch2/jqs1/fidelity/all/output/analysis50_full_2.parquet"
).to_pandas()

In [None]:
labelwise_df.info(memory_usage="deep")

In [None]:
# labelwise_df.index.names = ['filename', 'position', 't', 'trench_set', 'trench', 'label']
# labelwise_df.sort_index(inplace=True)

In [None]:
len(labelwise_df)

In [None]:
framewise_df.head()

In [None]:
trenchwise_df.head()

In [None]:
labelwise_df.head()