In [None]:
from io import StringIO, BytesIO
import time 
import threading

from IPython.display import Javascript
import pandas as pd
import numpy as np

import anywidget
import traitlets

import buckaroo

In [None]:
# Minimal boolean fixture: one column 'a' holding False, then True.
bool_df = pd.DataFrame([{'a': flag} for flag in (False, True)])
bool_df

In [None]:
# Fixture built from bare rows, so no column names are supplied.
numeric_columns = pd.DataFrame([['foo', 8], ['bar',23]])
# The older JSON-based BuckarooWidget works with numeric columns.
# BuckarooInfiniteWidget (the new default, which uses parquet) does not.
buckaroo.BuckarooWidget(numeric_columns) 

In [None]:
# Render numeric_columns in BuckarooWidget again.
# NOTE(review): this is the same call as the previous cell; possibly it was meant
# to demonstrate BuckarooInfiniteWidget instead - confirm.
buckaroo.BuckarooWidget(numeric_columns)

In [None]:
# Dict-valued cells where every row's dict uses the same keys ('foo', 'bar').
object_values = pd.DataFrame(
    [{'a': cell} for cell in ({'foo': 9, 'bar': 10}, {'foo': 3, 'bar': 5})])
object_values

In [None]:
# Dict-valued cells whose key names differ from row to row ('qux' vs 'bar').
object_values_varied = pd.DataFrame(
    [{'a': cell} for cell in ({'foo': 9, 'qux': 10}, {'foo': 3, 'bar': 5})])
object_values_varied

In [None]:
# Timestamp fixture: the current time followed by a fixed date, in column 'a'.
date_values = pd.DataFrame(
    [{'a': stamp} for stamp in (pd.Timestamp('now'), pd.Timestamp('2014-01-02'))])
# Print the dtype pandas picked for the column before displaying the frame.
print(date_values.dtypes)
date_values

In [None]:
# Awkward numeric values sharing one column: NaN, both infinities and the
# smallest / largest values an int64 can hold.
odd_integer_values = pd.DataFrame([
    {'a': value}
    for value in (
        np.nan,
        -np.inf,
        np.inf,
        np.iinfo(np.int64).min,
        np.iinfo(np.int64).max,
    )
])
odd_integer_values

In [None]:
# Integers outside the signed 64-bit range (twice the int64 minimum and
# maximum), together with a NaN.
over_64 = pd.DataFrame([
    {'a':np.nan},
    {'a':np.iinfo(np.int64).min * 2},
    {'a':np.iinfo(np.int64).max * 2},
])
# Display the frame built in this cell.
over_64

In [None]:
# List-valued cells where both lists have the same length.
lists_df = pd.DataFrame([{'a': items} for items in ([10, 20, 30], [30, 20, 40])])
lists_df

In [None]:
# List-valued cells where the lists have different lengths (3 and 2 items).
uneven_lists_df = pd.DataFrame([{'a': items} for items in ([10, 20, 30], [30, 40])])
uneven_lists_df

In [None]:
# Categorical fixture: the labels a, b, c repeated twice in column 'a'.
categorical_df = pd.DataFrame({'a': pd.Categorical(list('abc') * 2)})
categorical_df

In [None]:
# Frame with a two-level row index, plus its transpose, which carries the same
# two-level labels on the columns.
multi_index_rows = pd.DataFrame(
    [{'a': value} for value in range(10, 60, 10)],
    index=pd.MultiIndex.from_tuples(
        [('foo', 3), ('foo', 4), ('bar', 3), ('bar', 4), ('bar', 5)]))
multi_index_columns = multi_index_rows.transpose()
# Buckaroo doesn't work with multi indexes, so print the plain-text repr.
print(multi_index_rows)

In [None]:
# Frame whose row labels are the strings "foo" and "bar".
string_index = pd.DataFrame([{'a': value} for value in (10, 20)], index=["foo", "bar"])
# Buckaroo doesn't like string indexes, so print the plain-text repr.
print(string_index)

In [None]:
# One column mixing a string with both boolean values.
mixed_bool_str = pd.DataFrame([{'a': cell} for cell in ('a string', True, False)])

In [None]:
def roundtrip_json(df):
    """Return whether ``df`` equals itself after a to_json / read_json round trip."""
    serialized = df.to_json()
    restored = pd.read_json(StringIO(serialized))
    return df.equals(restored)
def roundtrip_feather(df):
    """Return whether ``df`` equals itself after a feather round trip.

    The frame is written to an in-memory buffer and read back from it.
    Returns the string ``"error"`` if writing or reading raises.
    """
    byts = BytesIO()
    try:
        df.to_feather(byts)
        # Rewind so the read starts from the beginning of what was written.
        byts.seek(0)
        return df.equals(pd.read_feather(byts))
    except Exception:
        # Exception (not a bare except) so KeyboardInterrupt / SystemExit
        # still propagate instead of being reported as a format failure.
        return "error"
def roundtrip_parquet(df):
    try:
        return df.equals(pd.read_parquet(BytesIO(df.to_parquet())))
    except:
        return "error"
def roundtrip_fastparquet(df):
    """Return whether ``df`` equals itself after a fastparquet round trip.

    Both the write and the read use ``engine='fastparquet'``. Returns the
    string ``"error"`` if writing or reading raises.
    """
    try:
        payload = df.to_parquet(engine='fastparquet')
        return df.equals(pd.read_parquet(BytesIO(payload), engine='fastparquet'))
    except Exception:
        # Exception (not a bare except) so KeyboardInterrupt / SystemExit
        # still propagate instead of being reported as a format failure.
        return "error"

def roundtrip_fastparquet_json(df):
    """Return whether ``df`` survives a fastparquet round trip with JSON object encoding.

    The write requests ``object_encoding={'a': 'json'}``, i.e. only a column
    named ``'a'`` is asked to be JSON-encoded. Returns the string ``"error"``
    if writing or reading raises.
    """
    try:
        out = BytesIO(df.to_parquet(engine='fastparquet', object_encoding={'a': 'json'}))
        return df.equals(pd.read_parquet(out, engine='fastparquet'))
    except Exception:
        # Exception (not a bare except) so KeyboardInterrupt / SystemExit
        # still propagate instead of being reported as a format failure.
        return "error"
         
def roundtrip_pickle(df):
    """Return whether ``df`` equals itself after a pickle round trip.

    The pickle is written to and read from an in-memory buffer, so running
    the check leaves no file behind in the working directory.
    """
    buffer = BytesIO()
    df.to_pickle(buffer)
    # Rewind so read_pickle starts from the first byte that was written.
    buffer.seek(0)
    return df.equals(pd.read_pickle(buffer))

# Spot check: fastparquet round trip of the frame with uneven-length lists.
roundtrip_fastparquet(uneven_lists_df)
# Other spot checks, currently disabled:
#roundtrip_feather(numeric_columns)
#roundtrip_pickle(numeric_columns)

In [None]:
#pd.read_parquet(BytesIO(mixed_bool_str.to_parquet(engine='fastparquet', object_encoding={'a':'bson'})), engine='fastparquet')


In [None]:
# Fixture frames to push through every serialisation format, keyed by the
# name shown in the results table.
# NOTE: a `mixed_bool_str2` entry used to be listed here, but no cell in this
# notebook defines that name, so building the dict raised NameError on a
# fresh kernel. Define the frame and add it back if that case is still wanted.
dfs = dict(
    numeric_columns = numeric_columns,
    object_values = object_values,
    object_values_varied = object_values_varied,
    date_values = date_values,
    odd_integer_values = odd_integer_values,
    lists_df = lists_df,
    uneven_lists_df = uneven_lists_df,
    multi_index_rows = multi_index_rows,
    multi_index_columns = multi_index_columns,
    mixed_bool_str = mixed_bool_str,
    categorical_df = categorical_df)

# One record per fixture; each format column holds what that format's
# roundtrip_* helper returned for the frame.
results = []
for k, df in dfs.items():
    results.append({
        'name': k,
        'json': roundtrip_json(df),
        'feather': roundtrip_feather(df),
        'parquet': roundtrip_parquet(df),
        'fast_parquet': roundtrip_fastparquet(df),
        'fast_parquet_json': roundtrip_fastparquet_json(df),
        'pickle': roundtrip_pickle(df),
    })
# The fixture names stay in a regular 'name' column rather than becoming the
# index (index=dfs.keys()) because Buckaroo doesn't work with string indexes.
results_df = pd.DataFrame(results)
results_df

In [None]:
class SerializationWidget(anywidget.AnyWidget):
    """Widget that decodes parquet bytes in the browser and syncs the rows back.

    The front-end module imports hyparquet from a CDN, reads the ``df_parquet``
    trait, parses its metadata, and decodes it with ``rowFormat: 'object'``.
    When decoding completes it stores the decoded data in ``df_json`` and
    saves the change on the model. It logs each step to the browser console.
    """
    # Front-end ES module; this string is shipped to the browser as-is.
    _esm = """
    import * as hyparquet from "https://cdn.jsdelivr.net/npm/hyparquet@1.8.4/+esm";
    function render({ model, el }) {
      console.log("hyparquet", hyparquet)

        const table_bytes = model.get("df_parquet")
        console.log("table_bytes", table_bytes.length, table_bytes)
        const metadata = hyparquet.parquetMetadata(table_bytes.buffer)
        console.log("metadata", metadata)
        hyparquet.parquetRead({
            file: table_bytes.buffer,
            metadata:metadata,
            rowFormat: 'object',
            onComplete: data => {
                const parqData = data;
                console.log("parqData", parqData)
                model.set("df_json", data)
                model.save_changes();
            }
      })
    }
    export default { render };
    """
    # Parquet payload read by the front-end via model.get("df_parquet").
    df_parquet = traitlets.Bytes().tag(sync=True)
    # Decoded data written by the front-end's onComplete callback.
    df_json = traitlets.Any().tag(sync=True)

# Push bool_df through the widget: serialise it with fastparquet, display the
# widget, then build a DataFrame from whatever is in df_json afterwards.
cw = SerializationWidget(df_parquet=bool_df.to_parquet(engine='fastparquet'))
display(cw)
# NOTE(review): presumably this pause is meant to give the browser time to
# decode the bytes and sync df_json back; a fixed sleep does not guarantee the
# value has arrived by the next line - confirm.
time.sleep(1)
out_df = pd.DataFrame(cw.df_json)

In [None]:
def bucakroo_to_parquet(df):
    """Serialise ``df`` to parquet with fastparquet, JSON-encoding object-like columns.

    Every column with a categorical or ``object`` dtype is given the
    ``'json'`` object encoding; other columns are left to fastparquet.

    Returns whatever ``df.to_parquet`` returns when called without a path.
    """
    # 'category' is the documented select_dtypes selector for categorical columns.
    obj_columns = df.select_dtypes(include=['category', 'object']).columns.to_list()
    encodings = {column: 'json' for column in obj_columns}
    return df.to_parquet(engine='fastparquet', object_encoding=encodings)


In [None]:
def rountrip_widget(df):
    """Send ``df`` through a fresh ``SerializationWidget`` and compare the result.

    The frame is serialised with ``bucakroo_to_parquet``, handed to a new
    widget, and the widget is displayed from a short-lived worker thread.
    After that thread finishes, the widget's ``df_json`` is turned back into
    a DataFrame and compared with ``df``.

    Returns the result of ``df.equals`` on the rebuilt frame.
    """
    cw2 = SerializationWidget(df_parquet=bucakroo_to_parquet(df))
    # Clear the browser console so only this run's front-end logging shows.
    display(Javascript("console.clear()"))

    def worker():
        display(cw2)
        time.sleep(.3)

    w = threading.Thread(name='worker', target=worker)
    w.start()
    w.join()
    # NOTE(review): join() blocks for the worker's sleep; whether the
    # front-end has synced df_json back by this point is not guaranteed -
    # confirm before relying on a False result.
    # Compare against the widget created in this call, not the global `cw`.
    return df.equals(pd.DataFrame(cw2.df_json))
# Try the widget round trip on the boolean fixture.
rountrip_widget(bool_df)

In [None]:
# Inspect the current value of df_json on the module-level widget `cw`.
cw.df_json