In [17]:
import tidypolars as tp
import polars as pl
from polars import col
import pandas as pd
import numpy as np
from timeit import timeit
import string

# Seed NumPy's global RNG so the synthetic benchmark data is reproducible.
# Every np.random call below consumes from this stream, so statement order matters.
np.random.seed(123)

# Draw 10 letters from a-z (np.random.choice samples with replacement by default);
# all random strings below are built from this reduced pool.
letters = np.asarray(list(string.ascii_lowercase))
letters = np.random.choice(letters, 10)

# data_size: number of rows in the main benchmark frame.
# rand_string_len: number of strings in the random-string pool (not the length of each string).
data_size = 1000000
rand_string_len = 200
# Start with 200 single letters, then append one more random letter to each
# element three times, giving a pool of 200 four-character strings.
rand_string = np.random.choice(letters, rand_string_len)
for i in range(3):
    rand_string = np.char.add(rand_string, np.random.choice(letters, rand_string_len))

# Main benchmark frame: `a` and `b` are integers sampled from 0..19,
# `c` and `d` are strings sampled from the rand_string pool.
tidypolars_df = tp.Tibble(
    a = np.random.choice(np.arange(20), data_size),
    b = np.random.choice(np.arange(20), data_size),
    c = np.random.choice(rand_string, data_size),
    d = np.random.choice(rand_string, data_size)
)
# The same data converted for the other two libraries, so all three are timed on identical rows.
polars_df = tidypolars_df.to_polars()
pandas_df = polars_df.to_pandas()

# Distinct values of `c` with an `id` column from tp.row_number().
# NOTE(review): presumably intended as a join lookup table; the benchmark
# cells visible here do not reference these *_join_df frames - confirm usage.
tidypolars_join_df = tidypolars_df.distinct('c').mutate(id = tp.row_number())
polars_join_df = tidypolars_join_df.to_polars()
pandas_join_df = tidypolars_join_df.to_pandas()

In [18]:
# Pool of 500 random five-letter lowercase names; the pivot data samples its
# `name` values from this list.
names_list = [
    "".join(np.random.choice(np.asarray(list(string.ascii_lowercase)), 5))
    for _ in range(500)
]

def make_pw_df(n_ids = 100, rows_per_id = 500):
    """Build the long-format Tibble used by the pivot_wider benchmark.

    One frame is created per id and the frames are stacked with ``bind_rows``.
    Each frame has three columns:

    * ``id``    - the id, repeated ``rows_per_id`` times
    * ``name``  - names sampled at random from the module-level ``names_list``
    * ``value`` - integers sampled at random from 0..19

    Parameters
    ----------
    n_ids : int, default 100
        Number of distinct ids (frames) to generate. Must be at least 1.
    rows_per_id : int, default 500
        Number of rows generated for each id.

    Returns
    -------
    tp.Tibble
        The stacked frame with ``n_ids * rows_per_id`` rows.

    Raises
    ------
    ValueError
        If ``n_ids`` is smaller than 1 (there would be nothing to stack).
    """
    if n_ids < 1:
        raise ValueError("n_ids must be at least 1")

    # Convert the name pool once instead of on every iteration.
    name_pool = np.asarray(names_list)

    # Keyword arguments are evaluated left to right, so the RNG is consumed in
    # the order name -> value for each id, one id after another.
    frames = [
        tp.Tibble(
            id = [i] * rows_per_id,
            name = np.random.choice(name_pool, rows_per_id),
            value = np.random.choice(np.arange(20), rows_per_id)
        )
        for i in range(n_ids)
    ]

    out = frames[0]
    for frame in frames[1:]:
        out = out.bind_rows(frame)
    return out
        
# Build the pivot benchmark data once, then convert it so that each library
# runs its pivot on the same rows.
tidypolars_pw_df = make_pw_df()
polars_pw_df = tidypolars_pw_df.to_polars()
pandas_pw_df = polars_pw_df.to_pandas()

In [22]:
# Threshold used by the case_when benchmarks.
# NOTE: despite its name this is the *mean* of column `a` (col('a').mean()),
# not the median. The name is kept because other cells may reference it.
median_x = tidypolars_df.summarize(avg = col('a').mean()).pull('avg')[0]

# Maps each benchmarked operation to three zero-argument callables, one per
# library, keyed 'tidypolars' / 'polars' / 'pandas'. The three callables of an
# operation are meant to perform the same work so their timings are comparable.
summarize_funcs = {
    'arrange' : dict(
        tidypolars = lambda: tidypolars_df.arrange('a', 'c'),
        polars = lambda: polars_df.sort(['a', 'c']),
        pandas = lambda: pandas_df.sort_values(by=['a', 'c'])
    ),
    'case_when' : dict(
        tidypolars = lambda: tidypolars_df.mutate(x_case = tp.case_when(col('a') < median_x).then(1).when(col('a') >= median_x).then(2).otherwise(3)),
        # Aliased so the new column is called x_case, as in the other two versions.
        polars = lambda: polars_df.with_columns(pl.when(col('a') < median_x).then(1).when(col('a') >= median_x).then(2).otherwise(3).alias('x_case')),
        # First condition is `<` (it used to be `>`), matching the tidypolars/polars logic.
        pandas = lambda: pandas_df.assign(x_case = lambda x : np.where(x.a < median_x, 1, (np.where(x.a >= median_x, 2, 3))))
    ),
    'distinct': dict(
        tidypolars = lambda: tidypolars_df.distinct('c'),
        polars = lambda: polars_df.select('c').distinct(),
        pandas = lambda: pandas_df[['c']].drop_duplicates()
    ),
    'filter' : dict(
        tidypolars = lambda: tidypolars_df.filter(col('a') <= 7, col('c') == 'brkc'),
        polars = lambda: polars_df.filter((col('a') <= 7) & (col('c') == 'brkc')),
        pandas = lambda: pandas_df[(pandas_df.a <= 7) & (pandas_df.c == 'brkc')]
    ),
    'left_join' : dict(
        # Each library joins the full frame against its own first 1000 rows on `c`.
        tidypolars = lambda: tidypolars_df.left_join(tidypolars_df.head(1000), on = 'c'),
        polars = lambda: polars_df.join(polars_df.slice(0, 1000),  on = 'c', how = 'left'),
        pandas = lambda: pandas_df.set_index('c').join(pandas_df.head(1000).set_index('c'), on = 'c', how='left', rsuffix='right_')
    ),
    'mutate' : dict(
        tidypolars = lambda: tidypolars_df.mutate(double_a = col('a') * 2, a_plus_b = col('a') + col('b')),
        polars = lambda: polars_df.with_columns([(col('a') * 2).alias('double_a'), (col('a') + col('b')).alias('a_plus_b')]),
        pandas = lambda: pandas_df.assign(double_a = lambda x: x.a * 2, a_plus_b = lambda x : x.a + x.b)
    ),
    'pivot_wider' : dict(
        tidypolars = lambda: tidypolars_pw_df.pivot_wider(names_from = 'name', values_from = 'value', values_fn = 'sum'),
        polars = lambda: polars_pw_df.pivot('value', 'id', 'name', 'sum'),
        # pandas pre-aggregates duplicate (name, id) pairs with a grouped sum, then pivots.
        pandas = lambda: pandas_pw_df.groupby(['name', 'id'], as_index = False)['value'].sum().pivot(index = "id", columns = "name", values = "value")
    ),
    'summarize': dict(
        tidypolars = lambda: tidypolars_df.summarize(x = col('a').median(), by = 'c'),
        polars = lambda: polars_df.groupby('c').agg(col('a').median().alias('x')),
        pandas = lambda: pandas_df.groupby('c', as_index = False)['a'].median()
    )
}

In [23]:
def benchmark_me(d, num_tests):
    """Time every callable in ``d`` and return the timings as a one-row Tibble.

    Parameters
    ----------
    d : dict
        Maps a label to a zero-argument callable. Each label becomes a column
        of the result, in the dict's iteration order.
    num_tests : int
        Passed to ``timeit`` as ``number``, i.e. how many times each callable
        is executed.

    Returns
    -------
    tp.Tibble
        A single row with one Float64 column per key of ``d``. Each value is
        the time reported by ``timeit`` for all ``num_tests`` executions
        together, multiplied by 1000 and rounded to 3 decimals.
    """
    timings = {label: [timeit(func, number = num_tests)] for label, func in d.items()}
    out = tp.Tibble(timings)
    # Scale whichever columns `d` produced, rather than a fixed list of names.
    return out.mutate((col(list(timings)) * 1000).round(3).cast(pl.Float64))

# Benchmark Results

In [24]:
# Time every operation and stack the one-row results into a single table,
# labelling each row with the name of the operation it measured.
bench_df = None
for key, value in summarize_funcs.items():
    timed_row = (
        benchmark_me(value, num_tests = 5)
        .mutate(func_tested = tp.lit(key))
        .relocate('func_tested')
    )
    bench_df = timed_row if bench_df is None else bench_df.bind_rows(timed_row)

# Show the results ordered by operation name.
bench_df.arrange('func_tested')

func_tested,tidypolars,polars,pandas
str,f64,f64,f64
"""arrange""",752.298,750.386,768.677
"""case_when""",134.716,135.721,84.105
"""distinct""",40.683,42.03,270.724
"""filter""",30.346,30.163,216.383
"""left_join""",889.414,900.966,2723.635
"""mutate""",15.976,8.513,78.746
"""pivot_wider""",40.915,42.768,144.66
"""summarize""",78.795,68.275,300.896


In [25]:
# Pass the same sorted results to print() to get the plain-text table
# rendering instead of the notebook's rich display.
bench_df.arrange('func_tested').pipe(print)

shape: (8, 4)
┌─────────────┬────────────┬─────────┬──────────┐
│ func_tested ┆ tidypolars ┆ polars  ┆ pandas   │
│ ---         ┆ ---        ┆ ---     ┆ ---      │
│ str         ┆ f64        ┆ f64     ┆ f64      │
╞═════════════╪════════════╪═════════╪══════════╡
│ arrange     ┆ 752.298    ┆ 750.386 ┆ 768.677  │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ case_when   ┆ 134.716    ┆ 135.721 ┆ 84.105   │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ distinct    ┆ 40.683     ┆ 42.03   ┆ 270.724  │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ filter      ┆ 30.346     ┆ 30.163  ┆ 216.383  │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ left_join   ┆ 889.414    ┆ 900.966 ┆ 2723.635 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ mutate      ┆ 15.976     ┆ 8.513   ┆ 78.746   │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ pivot_wider ┆ 40.915     ┆ 42.768  ┆ 144.66   │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ summarize   ┆ 78.795     ┆ 68.275 