In [2]:
import tidypolars as tp
import polars as pl
from polars import col
import pandas as pd
import numpy as np
from timeit import timeit
import string

np.random.seed(123)

letters = np.asarray(list(string.ascii_lowercase))
letters = np.random.choice(letters, 10)

data_size = 1000000
# data_size = 1000
rand_string = np.random.choice(letters, data_size)
for i in range(3):
    rand_string = np.char.add(rand_string, np.random.choice(letters, data_size))

tidypolars_df = tp.Tibble(
    a = np.random.choice(np.arange(20), data_size),
    b = np.random.choice(np.arange(20), data_size),
    c = np.random.choice(rand_string, data_size),
    d = np.random.choice(rand_string, data_size)
)
polars_df = tidypolars_df.clone().to_polars()
pandas_df = polars_df.to_pandas()

In [3]:
def benchmark_me(d, num_tests):
    for i, (key, value) in enumerate(d.items()):
        if i == 0:
            out = tp.Tibble({key: [timeit(value, number = num_tests)]})
        else:
            step = tp.Tibble({key: [timeit(value, number = num_tests)]})
            out = out.bind_cols(step)
    return out.mutate((tp.col_everything() * 1000).round(3).cast(pl.Float64))

In [16]:
median_x = tidypolars_df.summarize(avg = col('a').mean()).pull('avg')[0]

summarize_funcs = { 
    'filter' : dict(
        tidypolars = lambda: tidypolars_df.filter(col('a') <= 7, col('c') == 'brkc'),
        polars = lambda: polars_df.filter((col('a') <= 7) & (col('c') == 'brkc')),
        pandas = lambda: pandas_df[(pandas_df.a <= 7) & (pandas_df.c == 'brkc')]
    ),
    'summarize': dict(
        tidypolars = lambda: tidypolars_df.summarize(median = col('a').median(), groupby = 'c'),
        polars = lambda: polars_df.groupby('c').agg(col('a').median().alias('median')),
        pandas = lambda: pandas_df.groupby('c', as_index = False)['a'].median()
    ), 
    'mutate' : dict(
        tidypolars = lambda: tidypolars_df.mutate(double_a = col('a') * 2, a_plus_b = col('a') + col('b')),
        polars = lambda: polars_df.with_columns([(col('a') * 2).alias('double_a'), (col('a') + col('b')).alias('a_plus_b')]), 
        pandas = lambda: pandas_df.assign(double_a = lambda x: x.a * 2, a_plus_b = lambda x : x.a + x.b)
        ), 
    'case_when' : dict(
        tidypolars = lambda: tidypolars_df.mutate(x_case = tp.case_when(col('a') < median_x).then(1).when(col('a') >= median_x).then(2).otherwise(3)),
        polars = lambda: polars_df.with_columns(pl.when(col('a') < median_x).then(1).when(col('a') >= median_x).then(2).otherwise(3)), 
        pandas = lambda: pandas_df.assign(x_case = lambda x : np.where(x.a > median_x , 1, (np.where(x.a >= median_x, 2, 3))))
    )
}


# Summarize Benchmark Tests

In [22]:
for i, (key, value) in enumerate(summarize_funcs.items()):
    value = benchmark_me(value, num_tests = 5).mutate(func_tested = tp.lit(key)).relocate('func_tested')
    if i == 0:
        bench_df = value
    else:
        bench_df = bench_df.bind_rows(value)
print(bench_df)

shape: (4, 4)
┌─────────────┬────────────┬────────┬─────────┐
│ func_tested ┆ tidypolars ┆ polars ┆ pandas  │
│ ---         ┆ ---        ┆ ---    ┆ ---     │
│ str         ┆ f64        ┆ f64    ┆ f64     │
╞═════════════╪════════════╪════════╪═════════╡
│ filter      ┆ 43.502     ┆ 37.071 ┆ 256.945 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ summarize   ┆ 60.524     ┆ 56.616 ┆ 485.196 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ mutate      ┆ 8.171      ┆ 8.091  ┆ 140.072 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ case_when   ┆ 90.27      ┆ 84.132 ┆ 133.707 │
└─────────────┴────────────┴────────┴─────────┘


In [40]:
median_x = tidypolars_df.summarize(avg = col('a').median()).pull('avg')[0]
median_x

# tidypolars_df.mutate(x_case = tp.case_when(col('a') < median_x).then(1).when(col('a') >= median_x).then(2).otherwise(3))
pandas_df.assign(x_case = lambda x : np.where(x.a > median_x , 1, (np.where(x.a >= median_x, 2, 3))))  #(2 if x['a'] < 20 else 3))

# pandas_df.assign(double_a = lambda x: x.a * 2, a_plus_b = lambda x : x.a + 10)

Unnamed: 0,a,b,c,d,x_case
0,19,3,cbzg,wntz,1
1,7,3,brnw,rgkb,3
2,17,9,ztzb,tbcg,1
3,1,10,cncb,tncr,3
4,4,13,cbcb,tccc,3
...,...,...,...,...,...
999995,17,16,zwgg,wcrz,1
999996,15,1,nztr,cgwn,1
999997,18,0,ngbg,bcnr,1
999998,7,14,nrtb,bzzb,3
