# Load Modules

In [31]:
import numpy as np
import pandas as pd
import polars as pl
import gzip
import matplotlib.pyplot as plt
from scipy.special import expit

# Generate Data

In [2]:
np.random.seed(42)

n = 1000


def _random_labels(size):
    """Return `size` category labels of the form "c<k>".

    The exclusive upper bound is itself drawn from {2, 3, 4}, so the labels
    range over "c1" .. "c<upper-1>" (between one and three distinct values).
    """
    upper = np.random.randint(2, 5)
    return [f"c{code}" for code in np.random.randint(1, upper, size)]


# Keyword values are evaluated left to right, so the global RNG is consumed
# in the order var1 -> cat1 -> cat2.
data = dict(
    var1=np.random.random(n),
    cat1=_random_labels(n),
    cat2=_random_labels(n),
)

In [3]:
# Both libraries get a frame built from the same `data` dict; these are the
# objects the benchmark cells below operate on.
pddf, pldf = pd.DataFrame(data), pl.DataFrame(data)

# Benchmarks

## Loading Data

In [4]:
%%timeit
# Timed: construct a pandas DataFrame from the in-memory `data` dict.
pddf = pd.DataFrame(data=data)

233 µs ± 5.45 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [5]:
%%timeit
# Timed: construct a polars DataFrame from the in-memory `data` dict.
pldf = pl.DataFrame(data=data)

93.2 µs ± 183 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


## Writing CSV

In [6]:
%%timeit
# Timed: write the pandas frame as tab-separated text.
# index=False: by default pandas also writes its row index as an unnamed
# leading column; the polars frame has no index, so leaving it on would make
# the two files (and the later read-back benchmarks) differ by one column.
pddf.to_csv("data/pandas_frame.tab", sep="\t", index=False)

1.99 ms ± 4.38 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [7]:
%%timeit
# Timed: write the polars frame as tab-separated text.
polars_tab_path = "data/polars_frame.tab"
pldf.to_csv(polars_tab_path, sep="\t")

226 µs ± 14.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


## Writing GZIP CSV

In [33]:
%%timeit
# Timed: write the pandas frame as gzip-compressed, tab-separated text.
# compression="gzip" states explicitly what pandas would otherwise infer from
# the ".gz" suffix. index=False keeps the row index out of the file so its
# columns match the polars output (polars frames have no index).
pddf.to_csv(
    "data/pandas_frame.tab.gz",
    sep="\t",
    index=False,
    compression="gzip",
)

11.6 ms ± 110 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [34]:
%%timeit
# Timed: write the polars frame as gzip-compressed, tab-separated text.
# The gzip stream is opened in a `with` block so it is closed (and the gzip
# trailer flushed) on every timing loop instead of leaking one open handle per
# iteration. "wb" is the documented binary-write mode for gzip.open.
with gzip.open("data/polars_frame.tab.gz", "wb") as gz_out:
    pldf.to_csv(gz_out, sep="\t")

1.75 ms ± 6.29 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


## Reading CSV

In [8]:
%%timeit
# Timed: parse the tab-separated file written by the pandas CSV benchmark.
pandas_tab_path = "data/pandas_frame.tab"
pd.read_csv(pandas_tab_path, sep="\t")

916 µs ± 14.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [9]:
%%timeit
# Timed: parse the tab-separated file written by the polars CSV benchmark.
polars_tab_path = "data/polars_frame.tab"
pl.read_csv(polars_tab_path, sep="\t")

184 µs ± 1.5 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


## Reading GZIP CSV

In [35]:
%%timeit
# Timed: parse the gzip-compressed file written by the pandas gzip benchmark.
# No compression argument is passed; pandas picks it up from the file name.
pandas_gz_path = "data/pandas_frame.tab.gz"
pd.read_csv(pandas_gz_path, sep="\t")

1.79 ms ± 6.54 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [36]:
%%timeit
# Timed: decompress and parse the file written by the polars gzip benchmark.
# The gzip stream is opened in a `with` block so each timing loop closes its
# handle instead of leaving it for the garbage collector. "rb" is the same
# binary-read mode that "r" selects for gzip.open, just spelled explicitly.
with gzip.open("data/polars_frame.tab.gz", "rb") as gz_in:
    pl.read_csv(gz_in, sep="\t")

326 µs ± 5.06 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


## Get Unique

In [10]:
%%timeit
# Timed: distinct values of the cat1 column (pandas).
pddf["cat1"].unique()

52.2 µs ± 429 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [11]:
%%timeit
# Timed: distinct values of the cat1 column (polars).
pldf["cat1"].unique()

18.1 µs ± 42.2 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


## Series Operations

In [12]:
%%timeit
# Timed: mean of the var1 column (pandas).
pddf["var1"].mean()

42.8 µs ± 148 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [13]:
%%timeit
# Timed: mean of the var1 column (polars).
pldf["var1"].mean()

1.11 µs ± 3.19 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


## Pivoting

In [14]:
%%timeit
# Timed: cat1 x cat2 table of var1. aggfunc="mean" is pandas' default for
# pivot_table; it is written out here so the aggregation is visible.
piv_pd = pddf.pivot_table(
    values="var1",
    index="cat1",
    columns="cat2",
    aggfunc="mean",
)

3.11 ms ± 12.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [15]:
%%timeit
# Timed: cat1 x cat2 table of var1 (polars).
# Aggregated with mean() so the result is comparable to pandas' pivot_table,
# whose default aggregation is the mean; first() would instead keep only the
# first var1 value seen for each (cat1, cat2) pair.
piv_pl = (
    pldf
    .groupby("cat1")
    .pivot(pivot_column="cat2", values_column="var1")
    .mean()
)

214 µs ± 9.11 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [16]:
# Built once more outside %%timeit: names assigned inside a %%timeit cell are
# not kept in the notebook namespace, and the fold benchmarks further down
# need both pivot tables to exist.
piv_pd = pddf.pivot_table(index="cat1", columns="cat2", values="var1")
# mean() mirrors pivot_table's default aggregation, so both tables hold the
# same statistic; first() would keep only the first var1 value per cell.
piv_pl = (
    pldf
    .groupby("cat1")
    .pivot(pivot_column="cat2", values_column="var1")
    .mean()
)

## Column Assignment

In [17]:
%%timeit
# Timed: derive three scaled copies of var1 (pandas).
# assign() returns a new frame and leaves `pddf` untouched. Item assignment
# (pddf["test_a"] = ...) would mutate the shared frame, and that mutation
# survives the timing cell, so every later pandas benchmark would run on a
# wider frame than its polars counterpart. This also matches the polars cell,
# whose with_columns() call likewise returns a new frame.
pddf.assign(
    test_a=pddf["var1"] * 100,
    test_b=pddf["var1"] * 10,
    test_c=pddf["var1"] * 0.1,
)

334 µs ± 1.03 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [18]:
%%timeit
# Timed: derive three scaled copies of var1 (polars). The result is a new
# frame that is discarded; `pldf` itself is not reassigned.
var1 = pl.col("var1")
scaled_columns = [
    (var1 * factor).alias(label)
    for label, factor in (("test_a", 100), ("test_b", 10), ("test_c", 0.1))
]
pldf.with_columns(scaled_columns)

67.9 µs ± 1.34 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


## Filtering

In [19]:
%%timeit
# Timed: rows where cat1 is "c3" and cat2 is "c1" (pandas boolean mask).
cat1_is_c3 = pddf["cat1"] == "c3"
cat2_is_c1 = pddf["cat2"] == "c1"
pddf[cat1_is_c3 & cat2_is_c1]

390 µs ± 1.89 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [20]:
%%timeit
# Timed: rows where cat1 is "c3" and cat2 is "c1" (polars expression filter).
cat1_is_c3 = pl.col("cat1") == "c3"
cat2_is_c1 = pl.col("cat2") == "c1"
pldf.filter(cat1_is_c3 & cat2_is_c1)

101 µs ± 1.19 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


## Groupby

In [21]:
%%timeit
# Timed: mean of var1 for every (cat1, cat2) combination (pandas).
by_categories = pddf.groupby(["cat1", "cat2"])
by_categories.agg({"var1": "mean"})

952 µs ± 8.47 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [22]:
%%timeit
# Timed: mean of var1 for every (cat1, cat2) combination (polars).
by_categories = pldf.groupby(["cat1", "cat2"])
by_categories.agg({"var1": "mean"})

159 µs ± 3.94 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


## Groupby Arbitrary Function

In [23]:
%%timeit
# Timed: per-(cat1, cat2) sum of expit(var1), computed by handing each group
# to a Python callable via apply().
(
    pddf
    .groupby(["cat1", "cat2"])
    .apply(lambda group: expit(group["var1"]).sum())
)

1.25 ms ± 44.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [24]:
%%timeit
# Timed: per-(cat1, cat2) sum of expit(var1), with expit applied to the
# polars column expression and the result passed to agg().
var1_expit_sum = expit(pl.col("var1")).sum()
pldf.groupby(["cat1", "cat2"]).agg(var1_expit_sum)

322 µs ± 22.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


## Fold Operations

In [25]:
%%timeit
# Timed: sum down the rows of the pandas pivot table (one total per column).
piv_pd.sum(axis="index")

212 µs ± 6.11 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [26]:
%%timeit
# Timed: sum across the columns of the pandas pivot table (one total per row).
piv_pd.sum(axis="columns")

84 µs ± 2.24 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [27]:
%%timeit
# Timed: polars counterpart of the pandas axis=0 fold, run on the polars
# pivot table.
# NOTE(review): piv_pl presumably still carries the cat1 key as a regular
# column (polars has no index) — confirm how sum() treats it.
piv_pl.sum(axis=0)

11 µs ± 87.3 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [28]:
%%timeit
# Timed: polars counterpart of the pandas axis=1 fold, run on the polars
# pivot table.
# NOTE(review): piv_pl presumably still carries the cat1 key as a regular
# column (polars has no index) — confirm how a row-wise sum treats it.
piv_pl.sum(axis=1)

1.51 µs ± 17 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
