In [None]:
"""
A pandas range query is still an O(n) scan, but the underlying data structure
(NumPy arrays) is very fast. Filtering with [np_array > 5] should be many times faster than a Python-level [obj.x > 5] comparison, but how many times faster?
"""

In [13]:
import random
import sys
import time
import numpy as np
import pandas as pd
import polars as pl

In [14]:
class Thing:
    """Plain Python object with three random float attributes: x, y and z.

    Each attribute is drawn independently with ``random.random()``, i.e. a
    float in the half-open interval [0.0, 1.0).
    """

    def __init__(self):
        # Draw the three values in x, y, z order, then bind them in that order.
        coords = [random.random() for _ in range(3)]
        self.x, self.y, self.z = coords

# Number of objects in the benchmark population.
N_THINGS = 10**7

# Seed the generator so the population -- and therefore every match count
# printed by the query cells -- is identical after a kernel restart.
random.seed(0)
things = [Thing() for _ in range(N_THINGS)]


In [15]:
# Build the pandas frame: three float columns plus an object column 't' that
# holds the original Thing instances, so a query can return the objects.
# perf_counter() is the monotonic, high-resolution clock meant for timing.
t0 = time.perf_counter()
df_pd = pd.DataFrame({'x': [t.x for t in things], 'y': [t.y for t in things], 'z': [t.z for t in things], 't': things})
t1 = time.perf_counter()
print('built df in', t1-t0)
print(sys.getsizeof(df_pd) / 10**6, 'MB')
df_pd.columns

built df in 14.840728044509888
640.000144 MB


Index(['x', 'y', 'z', 't'], dtype='object')

In [10]:
# Time a three-condition range query on the pandas frame and collect the
# matching Thing objects from the 't' column.
t0 = time.perf_counter()
# Bracket indexing is used for the column: unlike attribute access it cannot
# be shadowed by a DataFrame attribute or method of the same name.
ls = df_pd.query('x < 0.0001 and y < 1 and z > 0.5')['t'].to_list()
t1 = time.perf_counter()
print(t1-t0)
print(len(ls))

0.0071866512298583984
33


In [11]:
# Baseline: the same three-condition filter as a plain Python list
# comprehension over the original objects.
t0 = time.perf_counter()
# Bind the result so the count printed below belongs to this cell rather than
# to whatever an earlier cell left in `ls`.
ls = [t for t in things if t.x < 0.0001 and t.y < 1 and t.z > 0.5]
t1 = time.perf_counter()
print(t1-t0)
print(len(ls))

0.050518035888671875
33


In [16]:
# Build the polars frame with the same layout as the pandas one: three float
# columns plus a column 't' holding the original Thing instances.
# perf_counter() is the monotonic, high-resolution clock meant for timing.
t0 = time.perf_counter()
df_pl = pl.DataFrame({
    'x': [t.x for t in things],
    'y': [t.y for t in things],
    'z': [t.z for t in things],
    't': things,
})
t1 = time.perf_counter()
print('built polars df in', t1-t0)

built polars df in 2.7918601036071777


In [17]:
# Time the same three-condition range query on the polars frame and collect
# the matching Thing objects from the 't' column.
t0 = time.perf_counter()
# The column is fetched with ["t"]; attribute-style access (`.t`) is not
# available on current polars DataFrames.
ls = df_pl.select(
    pl.col("t").filter((pl.col('x') < 0.0001) & (pl.col('y') < 1.0) & (pl.col('z') > 0.5))
)["t"].to_list()
t1 = time.perf_counter()
print(t1-t0)
print(len(ls))

0.0544133186340332
502


In [None]:
# Number of rows in the pandas frame.
df_pd.shape[0]