In [None]:
%pip install jupyter_ui_poll fastparquet anywidget

In [1]:
from io import StringIO, BytesIO
import time 

from IPython.display import Javascript
import pandas as pd
import numpy as np

import anywidget
import traitlets

#import buckaroo

In [2]:
bool_df = pd.DataFrame([{'a':False}, {'a':True}])
bool_df

Unnamed: 0,a
0,False
1,True


In [3]:
numeric_columns = pd.DataFrame([['foo', 8], ['bar',23]])
# older json based Buckaroo works with numeric columns
# BuckarooInfiniteWidget - the new default - which uses parquet doesn't
#buckaroo.BuckarooWidget(numeric_columns) 

In [4]:
#continguous
object_values = pd.DataFrame([
    {'a':{'foo':9, 'bar':10}},
    {'a':{'foo':3, 'bar':5}}])
object_values

Unnamed: 0,a
0,"{'foo': 9, 'bar': 10}"
1,"{'foo': 3, 'bar': 5}"


In [5]:
#different key names
object_values_varied = pd.DataFrame([
    {'a':{'foo':9, 'qux':10}},
    {'a':{'foo':3, 'bar':5}}])
object_values_varied

Unnamed: 0,a
0,"{'foo': 9, 'qux': 10}"
1,"{'foo': 3, 'bar': 5}"


In [6]:
date_values = pd.DataFrame([{'a':pd.Timestamp('now')},
                            {'a':pd.Timestamp('2014-01-02')}])
print(date_values.dtypes)
date_values

a    datetime64[ns]
dtype: object


Unnamed: 0,a
0,2025-03-01 09:19:35.732574
1,2014-01-02 00:00:00.000000


In [7]:
odd_integer_values = pd.DataFrame([
    {'a':np.nan},
    {'a':np.inf * -1},
    {'a':np.inf},
    {'a':np.iinfo(np.int64).min},
    {'a':np.iinfo(np.int64).max},
])
odd_integer_values    

Unnamed: 0,a
0,
1,-inf
2,inf
3,-9.223372e+18
4,9.223372e+18


In [8]:
over_64 = pd.DataFrame([
    {'a':np.nan},
    {'a':np.iinfo(np.int64).min * 2},
    {'a':np.iinfo(np.int64).max * 2},
])
odd_integer_values    

Unnamed: 0,a
0,
1,-inf
2,inf
3,-9.223372e+18
4,9.223372e+18


In [9]:
lists_df = pd.DataFrame([
    {'a': [10, 20, 30]},
    {'a': [30, 20, 40]}])
lists_df

Unnamed: 0,a
0,"[10, 20, 30]"
1,"[30, 20, 40]"


In [10]:
uneven_lists_df = pd.DataFrame([
    {'a': [10, 20, 30]},
    {'a': [30, 40]}])
uneven_lists_df

Unnamed: 0,a
0,"[10, 20, 30]"
1,"[30, 40]"


In [11]:
categorical_df = pd.DataFrame({'a': pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'])})
categorical_df

Unnamed: 0,a
0,a
1,b
2,c
3,a
4,b
5,c


In [12]:
multi_index_rows = pd.DataFrame(
    [{'a':10}, {'a':20}, {'a':30}, {'a':40}, {'a':50}],
    index=pd.MultiIndex.from_tuples(
        [('foo', 3), ('foo', 4), ('bar', 3), ('bar', 4), ('bar', 5)]))
multi_index_columns = multi_index_rows.T
print(multi_index_rows) #Buckaroo doesn't work with multi indexes

        a
foo 3  10
    4  20
bar 3  30
    4  40
    5  50


In [13]:
string_index = pd.DataFrame([{'a':10}, {'a':20}], index=["foo", "bar"])
print(string_index) # Buckaroo doesn't like string indexes

      a
foo  10
bar  20


In [14]:
mixed_bool_str = pd.DataFrame([
    {'a': 'a string'},
    {'a': True},
    {'a': False}])

In [15]:
def bucakroo_to_parquet(df):
    obj_columns = df.select_dtypes([pd.CategoricalDtype(), 'object']).columns.to_list()
    encodings = {k:'json' for k in obj_columns}
    return df.to_parquet(engine='fastparquet', object_encoding=encodings)
    pd.read_parquet(BytesIO(mixed_bool_str.to_parquet(engine='fastparquet', object_encoding={'a':'bson'})), engine='fastparquet')

class SerializationWidget(anywidget.AnyWidget):
    _esm = """
    import * as hyparquet from "https://cdn.jsdelivr.net/npm/hyparquet@1.8.4/+esm";
    function render({ model, el }) {
      console.log("hyparquet", hyparquet)

        const table_bytes = model.get("df_parquet")
        console.log("table_bytes", table_bytes.length, table_bytes)
        const metadata = hyparquet.parquetMetadata(table_bytes.buffer)
        console.log("metadata", metadata)
        hyparquet.parquetRead({
            file: table_bytes.buffer,
            metadata:metadata,
            rowFormat: 'object',
            onComplete: data => {
                const parqData = data;
                console.log("parqData", parqData)
                model.set("df_json", data)
                model.save_changes();
            }
      })
    }
    export default { render };
    """
    df_parquet = traitlets.Bytes().tag(sync=True)
    df_json = traitlets.Any().tag(sync=True)

cw = SerializationWidget(df_parquet=bucakroo_to_parquet(bool_df))
display(cw)
#check the javascript console to see what cw logged

SerializationWidget(df_parquet=b'PAR1\x15\x00\x15\x1e\x15",\x15\x04\x15\x00\x15\x06\x15\x08\x00\x00\x0f8\x02\x…

In [16]:
from jupyter_ui_poll import ui_events
#adapted from  https://stackoverflow.com/questions/54629964/how-to-pause-jupyter-notebook-widgets-waiting-for-user-input

def rountrip_widget_df(df):
    """ just returns the df """
    try:
        cw2 = SerializationWidget(df_parquet=bucakroo_to_parquet(df))
        display(Javascript("console.clear()"))
        display(cw2)
        with ui_events() as poll:
            while cw2.df_json is None:
                #waiting for user input
                time.sleep(.1)
                poll(1) # poll queued UI events including button
                pass
        return pd.DataFrame(cw2.df_json)
    except:
        return "error"
rountrip_widget_df(bool_df)

<IPython.core.display.Javascript object>

SerializationWidget(df_parquet=b'PAR1\x15\x00\x15\x1e\x15",\x15\x04\x15\x00\x15\x06\x15\x08\x00\x00\x0f8\x02\x…

Unnamed: 0,a
0,False
1,True


In [17]:
def roundtrip_json(df):
    return df.equals(pd.read_json(StringIO(df.to_json())))
def roundtrip_feather(df):
    byts = BytesIO()
    try:
        df.to_feather(byts)
        return df.equals(pd.read_feather(byts))
    except:
        return "error"
def roundtrip_parquet(df):
    try:
        return df.equals(pd.read_parquet(BytesIO(df.to_parquet())))
    except:
        return "error"
def roundtrip_fastparquet(df):
    try:
        return df.equals(pd.read_parquet(BytesIO(df.to_parquet(engine='fastparquet')), engine='fastparquet'))
    except:
        return "error"

def roundtrip_fastparquet_json(df):
    try:
        out = BytesIO(df.to_parquet(engine='fastparquet', object_encoding={'a': 'json'}))
        return df.equals(pd.read_parquet(out, engine='fastparquet'))
    except:
        return "error"
def rountrip_widget_test(df):
    try:
        return df.equals(rountrip_widget_df(df))
    except:
        return "error"
def roundtrip_pickle(df):
    df.to_pickle('byts')
    return df.equals(pd.read_pickle('byts'))


In [18]:
dfs = dict(
    numeric_columns = numeric_columns,
    bool_df=bool_df,
    object_values = object_values,
    object_values_varied = object_values_varied,
    date_values = date_values,
    odd_integer_values = odd_integer_values,
    lists_df = lists_df,
    uneven_lists_df = uneven_lists_df,
    multi_index_rows = multi_index_rows,
    multi_index_columns = multi_index_columns,
    mixed_bool_str = mixed_bool_str,
    categorical_df = categorical_df)
results = []
for k, df in dfs.items():
    results.append({
        'name':k,
        'json':roundtrip_json(df), 'feather':roundtrip_feather(df),
        'fast_parquet_json':roundtrip_fastparquet_json(df),
        'widget': rountrip_widget_test(df),

        'parquet':roundtrip_parquet(df),
        'fast_parquet':roundtrip_fastparquet(df), 
        'pickle': roundtrip_pickle(df), 
                   })
#results_df = pd.DataFrame(results, index=dfs.keys())
results_df = pd.DataFrame(results) #Buckaroo doesn't work with string indexes
results_df

<IPython.core.display.Javascript object>

SerializationWidget(df_parquet=b'PAR1\x15\x00\x15\x1e\x15",\x15\x04\x15\x00\x15\x06\x15\x08\x00\x00\x0f8\x02\x…

<IPython.core.display.Javascript object>

SerializationWidget(df_parquet=b'PAR1\x15\x00\x15r\x15`,\x15\x04\x15\x00\x15\x06\x15\x08\x00\x009p\x02\x00\x00…

<IPython.core.display.Javascript object>

SerializationWidget(df_parquet=b'PAR1\x15\x00\x15r\x15h,\x15\x04\x15\x00\x15\x06\x15\x08\x00\x009p\x02\x00\x00…

<IPython.core.display.Javascript object>

SerializationWidget(df_parquet=b'PAR1\x15\x00\x15<\x15@,\x15\x04\x15\x00\x15\x06\x15\x08\x00\x00\x1et\x02\x00\…

<IPython.core.display.Javascript object>

SerializationWidget(df_parquet=b'PAR1\x15\x00\x15\\\x15J,\x15\n\x15\x00\x15\x06\x15\x08\x00\x00.\x18\x02\x00\x…

<IPython.core.display.Javascript object>

SerializationWidget(df_parquet=b'PAR1\x15\x00\x15T\x15P,\x15\x04\x15\x00\x15\x06\x15\x08\x00\x00*L\x02\x00\x00…

<IPython.core.display.Javascript object>

SerializationWidget(df_parquet=b'PAR1\x15\x00\x15N\x15P,\x15\x04\x15\x00\x15\x06\x15\x08\x00\x00\'P\x02\x00\x0…

<IPython.core.display.Javascript object>

SerializationWidget(df_parquet=b'PAR1\x15\x00\x15Z\x15^,\x15\x06\x15\x00\x15\x06\x15\x08\x00\x00-\xb0\x02\x00\…

<IPython.core.display.Javascript object>

SerializationWidget(df_parquet=b'PAR1\x15\x04\x15*\x15.L\x15\x06\x15\x00\x00\x00\x15P\x03\x00\x00\x00"a"\x03\x…

Unnamed: 0,name,json,feather,fast_parquet_json,widget,parquet,fast_parquet,pickle
0,numeric_columns,True,True,error,False,True,error,True
1,bool_df,True,True,True,True,True,True,True
2,object_values,True,True,True,True,True,True,True
3,object_values_varied,True,False,True,True,False,True,True
4,date_values,False,True,True,False,True,True,True
5,odd_integer_values,False,True,True,False,True,True,True
6,lists_df,True,False,True,True,False,True,True
7,uneven_lists_df,True,False,True,True,False,True,True
8,multi_index_rows,False,True,error,False,True,error,True
9,multi_index_columns,False,True,error,False,True,error,True
