In [None]:
import timeit
import math
import json
import io
import pandas as pd
import buckaroo
import numpy as np
from buckaroo.serialization_utils import pd_to_obj

# Benchmark configuration constants.
N = 500
NA = pd.NA
ROWS = 100_000
# NOTE: typed_df is intentionally NOT built in this cell. It needs
# random_categorical, which is only defined in a later cell, so building it
# here raised NameError on a fresh kernel (Restart & Run All). The frame is
# constructed right after the generator functions are defined.

In [None]:
def rand_cat(named_p, na_per, N):
    """Randomly draw the named-category (and ``pd.NA``) portion of a column.

    Parameters
    ----------
    named_p : dict
        Maps each category value to the fraction of the ``N`` rows it should
        fill on average.
    na_per : float
        Fraction of the ``N`` rows that should be ``pd.NA`` on average.
    N : int
        Length of the full column. Only
        ``floor((sum(named_p.values()) + na_per) * N)`` values are drawn here.

    Returns
    -------
    list
        The drawn values; an empty list when the combined fraction is not
        positive.
    """
    named_total_per = sum(named_p.values()) + na_per
    total_len = int(np.floor(named_total_per * N))
    if not named_total_per > 0:
        return []

    # pd.NA is always a candidate; its weight is simply 0 when na_per is 0.
    choices = [*named_p.keys(), pd.NA]
    # Re-normalise so the weights sum to 1 over the portion being generated.
    p = [v / named_total_per for v in named_p.values()]
    p.append(na_per / named_total_per)

    # One vectorised draw instead of one np.random.choice call per element:
    # the per-call version rebuilt the choices array and re-validated `p`
    # on every iteration.
    return np.random.choice(choices, size=total_len, p=p).tolist()

def random_categorical(named_p, unique_per, na_per, longtail_per, N):
    """Build a shuffled array mixing named, long-tail and unique values.

    The result is assembled from three parts:

    * values drawn by ``rand_cat(named_p, na_per, N)``;
    * ``floor(longtail_per * N) // 2`` distinct ``"long_<i>"`` labels, each
      appearing exactly twice;
    * ``"unique_<i>"`` labels, one each, filling whatever is left up to ``N``
      entries (none are added if the first two parts already reach ``N``).

    ``unique_per`` is accepted but not read: the number of unique values is
    derived from the remainder instead.

    Returns the combined values as a NumPy array shuffled with
    ``np.random.shuffle``.
    """
    named_values = rand_cat(named_p, na_per, N)

    pair_count = int(np.floor(longtail_per * N)) // 2
    # Each long-tail label is emitted twice, back to back.
    filler = ["long_%d" % idx for idx in range(pair_count) for _ in range(2)]

    remaining = N - (len(filler) + len(named_values))
    filler.extend("unique_%d" % idx for idx in range(remaining))

    combined = np.concatenate([named_values, filler])
    np.random.shuffle(combined)
    return combined
    
# Benchmark configuration and the synthetic input frame.
N = 500
NA = pd.NA
ROWS = 100_000
# ROWS-long frame with an int column, a float column, a constant string
# column, and a string column that is half twice-repeated "long_*" labels and
# half one-off "unique_*" labels.
typed_df = pd.DataFrame(
    dict(
        int_col=np.random.randint(1, 50, ROWS),
        float_col=np.random.randint(1, 30, ROWS) / .7,
        str_col=["foobar"] * ROWS,
        longtail_unique=random_categorical(
            {}, unique_per=0.5, na_per=.0, longtail_per=.5, N=ROWS),
    )
)

In [None]:
def human_size_temporal(frac_seconds, order=None):
    """Scale a duration in seconds to a readable unit.

    Parameters
    ----------
    frac_seconds : float
        Duration in seconds.
    order : int, optional
        Index of the unit to use: 0 = s, 1 = ms, 2 = us, 3 = ns. When
        omitted, it is chosen from the magnitude of ``frac_seconds``;
        values of 1 or more use seconds and non-positive values use
        nanoseconds.

    Returns
    -------
    tuple
        ``(value, unit, order)`` where ``value`` is the scaled duration
        rounded to 3 decimal places.
    """
    unit_table = (("s", 1), ("ms", 1e3), ("us", 1e6), ("ns", 1e9))
    decimals = 3

    if order is None:
        if frac_seconds >= 1000.0:
            order = 0
        elif frac_seconds > 0.0:
            # Every three decimal orders of magnitude below 1 s moves one
            # unit down, capped at the smallest unit (ns).
            exponent = int(math.floor(math.log10(frac_seconds)))
            order = min(-(exponent // 3), 3)
        else:
            order = 3

    label, factor = unit_table[order]
    return (round(frac_seconds * factor, decimals), label, order)
def time_stats(stmt, number=29, repeat=5, order=None, **kwargs):
    """Time ``stmt`` and report the best per-loop duration.

    adapted from ipython magic timeit
    https://github.com/jupyter/ipython-py3k/blob/17bd8dfb1da9940799c3c3e0e19af098f4f93a45/IPython/core/magic.py#L1741

    Parameters
    ----------
    stmt : str
        Statement passed to ``timeit.repeat``.
    number : int
        Executions of ``stmt`` per timing run.
    repeat : int
        Number of timing runs; the fastest one is reported.
    order : int, optional
        Unit index forwarded to ``human_size_temporal``; ``None`` lets it
        pick the unit.
    **kwargs
        Used as the global namespace in which ``stmt`` is executed.

    Returns
    -------
    The best per-loop time, scaled to the unit that is also printed.
    """
    times = timeit.repeat(stmt, number=number, repeat=repeat, globals=kwargs)
    # Each entry is the total for `number` executions; convert to per-loop.
    per_loop = np.array(times) / number
    best = np.min(per_loop)

    scaled, units, _ = human_size_temporal(best, order=order)
    print("%d loops, best of %d: %f %s per loop" % (number, repeat, scaled, units))
    return scaled

In [None]:
def arrow_write(df):
    """Write ``df`` with ``to_feather`` into a throwaway in-memory buffer.

    Nothing is returned; the buffer is discarded.
    """
    df.to_feather(io.BytesIO())
def arrow_size(df):
    """Return how many bytes ``df.to_feather`` writes to an in-memory buffer."""
    buffer = io.BytesIO()
    df.to_feather(buffer)
    return len(buffer.getvalue())
def run_timings(N, number=29):
    """Benchmark three serializers on the first ``N`` rows of ``typed_df``.

    For ``to_parquet``, ``arrow_write`` (Feather) and ``pd_to_obj`` this
    records the best per-loop time from ``time_stats`` and the size of the
    serialized output. Times are requested with ``order=1`` (milliseconds) so
    they are comparable across calls; the ``pd_to_obj`` size is the length of
    its ``json.dumps`` string.

    Parameters
    ----------
    N : int
        Number of leading rows of ``typed_df`` to benchmark.
    number : int
        Executions per timing run, forwarded to ``time_stats``.

    Returns
    -------
    dict
        Keys ``N``, ``parquet_time``, ``parquet_size``, ``arrow_time``,
        ``arrow_size``, ``pd_to_obj_time`` and ``pd_to_obj_size``.
    """
    sample = typed_df[:N]
    # `order` and `number` are consumed by time_stats itself; the remaining
    # entries become the namespace the timed statements run in, so the
    # statements' `typed_df` refers to the sliced sample.
    shared = {
        "N": N,
        "pd_to_obj": pd_to_obj,
        "typed_df": sample,
        "arrow_write": arrow_write,
        "order": 1,
        "number": number,
    }

    results = {"N": N}
    results["parquet_time"] = time_stats('typed_df.to_parquet()', **shared)
    results["parquet_size"] = len(sample.to_parquet())
    results["arrow_time"] = time_stats('arrow_write(typed_df)', **shared)
    results["arrow_size"] = arrow_size(sample)
    results["pd_to_obj_time"] = time_stats('pd_to_obj(typed_df)', **shared)
    results["pd_to_obj_size"] = len(json.dumps(pd_to_obj(sample)))
    return results
# Benchmark growing slices of typed_df. Each tuple holds the positional
# arguments for run_timings; the two largest sizes pass a smaller `number`
# than the default.
time_df = pd.DataFrame([
    run_timings(*timing_args)
    for timing_args in (
        (100,),
        (500,),
        (1000,),
        (5000,),
        (10000,),
        (20000, 10),
        (100_000, 5),
    )
])
# How many times longer pd_to_obj took than to_parquet.
time_df['parq_speedup'] = time_df['pd_to_obj_time'] / time_df['parquet_time']
# Times are in milliseconds (run_timings passes order=1), so scaling the
# time-per-byte ratio by 1_000_000 expresses it in nanoseconds per byte.
time_df['parq_per_byte'] = 1_000_000 * (time_df['parquet_time'] / time_df['parquet_size'])
time_df['json_per_byte'] = 1_000_000 * (time_df['pd_to_obj_time'] / time_df['pd_to_obj_size'])
buckaroo.BuckarooWidget(time_df)

In [None]:
# Show the benchmark input frame as this cell's output.
typed_df