In [1]:
import tidypolars as tp
import polars as pl
from polars import col
import pandas as pd
import numpy as np
from timeit import timeit
import string

# Seed NumPy's legacy global RNG so the synthetic data below is reproducible.
np.random.seed(123)

# Pool of 10 lowercase letters (drawn with replacement) used to build strings.
letters = np.random.choice(np.asarray(list(string.ascii_lowercase)), 10)

# Number of rows in the main benchmark frame (use e.g. 1000 for a quick run).
data_size = 1_000_000

# Build 4-character strings: one initial draw plus three appended draws.
rand_string = np.random.choice(letters, data_size)
for i in range(3):
    next_chars = np.random.choice(letters, data_size)
    rand_string = np.char.add(rand_string, next_chars)

# Columns a/b are integers sampled from 0..19; c/d are sampled from the strings
# above. Keyword arguments are evaluated in order, so the draws happen a, b, c, d.
int_pool = np.arange(20)
tidypolars_df = tp.Tibble(
    a = np.random.choice(int_pool, data_size),
    b = np.random.choice(int_pool, data_size),
    c = np.random.choice(rand_string, data_size),
    d = np.random.choice(rand_string, data_size),
)

# The same data exposed through each library under test.
polars_df = tidypolars_df.clone().to_polars()
pandas_df = polars_df.to_pandas()

In [38]:
# Smaller frame used only by the pivot_wider benchmark.
pw_data_size = 100_000

# 100 random 10-letter names drawn from the lowercase alphabet.
alphabet = np.asarray(list(string.ascii_lowercase))
names_list = ["".join(np.random.choice(alphabet, 10)) for _ in range(100)]

# Each name is repeated consecutively this many times to fill the 'name' column.
rows_per_name = pw_data_size // len(names_list)

tidypolars_pw_df = tp.Tibble(
    id = np.random.choice(np.arange(10), pw_data_size),
    name = [name for name in names_list for _ in range(rows_per_name)],
    value = np.random.choice(np.arange(20), pw_data_size),
)

# The same data exposed through each library under test.
polars_pw_df = tidypolars_pw_df.clone().to_polars()
pandas_pw_df = polars_pw_df.to_pandas()

In [22]:
# Threshold used by the case_when benchmark.
# NOTE: despite the name, this is the *mean* of column 'a' (the aggregation
# below is .mean()), not its median. The name is kept for compatibility.
median_x = tidypolars_df.summarize(avg = col('a').mean()).pull('avg')[0]

# Benchmark registry: operation name -> {library name -> zero-argument callable}.
# Each callable runs one operation against the module-level frame for its
# library and is invoked by timeit. Within an operation the three callables are
# meant to perform equivalent work so that timings are comparable.
summarize_funcs = {
    # Sort by column 'a'.
    'arrange' : dict(
        tidypolars = lambda: tidypolars_df.arrange('a'),
        polars = lambda: polars_df.sort(col('a')),
        pandas = lambda: pandas_df.sort_values(by=['a'])
    ),
    # Add 'x_case': 1 when a < threshold, 2 when a >= threshold, otherwise 3.
    # The pandas branch uses `<` to match the other two libraries, and the
    # polars expression is aliased so all three produce a column named 'x_case'.
    'case_when' : dict(
        tidypolars = lambda: tidypolars_df.mutate(x_case = tp.case_when(col('a') < median_x).then(1).when(col('a') >= median_x).then(2).otherwise(3)),
        polars = lambda: polars_df.with_columns(pl.when(col('a') < median_x).then(1).when(col('a') >= median_x).then(2).otherwise(3).alias('x_case')),
        pandas = lambda: pandas_df.assign(x_case = lambda x : np.where(x.a < median_x , 1, (np.where(x.a >= median_x, 2, 3))))
    ),
    # Unique values of column 'a'.
    'distinct': dict(
        tidypolars = lambda: tidypolars_df.distinct('a'),
        polars = lambda: polars_df.select('a').drop_duplicates(),
        pandas = lambda: pandas_df.a.drop_duplicates()
    ),
    # Keep rows where a <= 7 and c == 'brkc'.
    'filter' : dict(
        tidypolars = lambda: tidypolars_df.filter(col('a') <= 7, col('c') == 'brkc'),
        polars = lambda: polars_df.filter((col('a') <= 7) & (col('c') == 'brkc')),
        pandas = lambda: pandas_df[(pandas_df.a <= 7) & (pandas_df.c == 'brkc')]
    ),
    # Joins: the full frame against its own first 1000 rows, keyed on 'c'.
    'full_join' : dict(
        tidypolars = lambda: tidypolars_df.full_join(tidypolars_df.head(1000), on = 'c'),
        polars = lambda: polars_df.join(polars_df.slice(0, 1000),  on = 'c', how = 'outer'),
        pandas = lambda: pandas_df.set_index('c').join(pandas_df.head(1000).set_index('c'), on = 'c', how='outer', rsuffix='right_')
    ),
    'inner_join' : dict(
        tidypolars = lambda: tidypolars_df.inner_join(tidypolars_df.head(1000), on = 'c'),
        polars = lambda: polars_df.join(polars_df.slice(0, 1000),  on = 'c', how = 'inner'),
        pandas = lambda: pandas_df.set_index('c').join(pandas_df.head(1000).set_index('c'), on = 'c', how='inner', rsuffix='right_')
    ),
    'left_join' : dict(
        tidypolars = lambda: tidypolars_df.left_join(tidypolars_df.head(1000), on = 'c'),
        polars = lambda: polars_df.join(polars_df.slice(0, 1000),  on = 'c', how = 'left'),
        pandas = lambda: pandas_df.set_index('c').join(pandas_df.head(1000).set_index('c'), on = 'c', how='left', rsuffix='right_')
    ),
    # Add two derived columns: a * 2 and a + b.
    'mutate' : dict(
        tidypolars = lambda: tidypolars_df.mutate(double_a = col('a') * 2, a_plus_b = col('a') + col('b')),
        polars = lambda: polars_df.with_columns([(col('a') * 2).alias('double_a'), (col('a') + col('b')).alias('a_plus_b')]),
        pandas = lambda: pandas_df.assign(double_a = lambda x: x.a * 2, a_plus_b = lambda x : x.a + x.b)
    ),
    # Pivot 'name' into columns, summing 'value' per id and filling gaps with 0.
    'pivot_wider' : dict(
        tidypolars = lambda: tidypolars_pw_df.pivot_wider(names_from = 'name', values_from = 'value', values_fill = 0, values_fn = 'sum'),
        polars = lambda: polars_pw_df.groupby(['id']).pivot(pivot_column='name', values_column='value').sum().fill_null(0),
        pandas = lambda: pandas_pw_df.groupby(['name', 'id'], as_index = False)['value'].sum().pivot(index = "id", columns = "name", values = "value").fillna(0)
    ),
    # Median of 'a' within each group of 'c'.
    'summarize': dict(
        tidypolars = lambda: tidypolars_df.summarize(x = col('a').median(), by = 'c'),
        polars = lambda: polars_df.groupby('c').agg(col('a').median().alias('x')),
        pandas = lambda: pandas_df.groupby('c', as_index = False)['a'].median()
    )
}


In [23]:
def benchmark_me(d, num_tests):
    """Time each callable in ``d`` and return the results as a one-row Tibble.

    Parameters
    ----------
    d : dict
        Maps a label (used as the column name) to a zero-argument callable.
    num_tests : int
        Number of times each callable is executed; passed to ``timeit`` as
        ``number``.

    Returns
    -------
    tp.Tibble
        One row with one Float64 column per label, in the iteration order of
        ``d``. Each value is the *total* time for all ``num_tests`` executions
        (not the per-run time), converted to milliseconds and rounded to 3
        decimal places.

    Raises
    ------
    ValueError
        If ``d`` is empty, since there is nothing to time.
    """
    if not d:
        raise ValueError("benchmark_me requires at least one callable to time")

    # Build the frame in one step instead of growing it column by column.
    timings = {
        label: [timeit(func, number = num_tests)]
        for label, func in d.items()
    }
    out = tp.Tibble(timings)

    # timeit reports seconds; scale to milliseconds.
    return out.mutate((tp.col_everything() * 1000).round(3).cast(pl.Float64))

# Benchmark Results

In [43]:
# Time every registered operation; each one contributes a single row that is
# labelled with the operation name in a leading 'func_tested' column.
bench_df = None
for func_name, implementations in summarize_funcs.items():
    timings = (
        benchmark_me(implementations, num_tests = 5)
        .mutate(func_tested = tp.lit(func_name))
        .relocate('func_tested')
    )
    bench_df = timings if bench_df is None else bench_df.bind_rows(timings)

# Display the results ordered by operation name.
bench_df.arrange('func_tested')

func_tested,tidypolars,polars,pandas
str,f64,f64,f64
"""arrange""",178.499,204.379,617.298
"""case_when""",90.423,78.705,152.579
"""distinct""",16.895,18.085,29.339
"""filter""",31.584,31.64,234.103
"""full_join""",231.754,242.483,1217.791
"""inner_join""",49.197,54.01,671.946
"""left_join""",143.866,143.215,1188.211
"""mutate""",10.004,8.946,120.789
"""pivot_wider""",42.707,42.015,52.247
"""summarize""",61.177,59.676,449.369
