In [None]:
"""
While pandas is gonna be an O(n) range query, the underlying data structure
is really fast (numpy arrays). [np_array > 5] should be many times faster than [obj.x > 5], but how many?
"""

In [43]:
import random
import sys
import time
import numpy as np
import pandas as pd
import polars as pl

In [3]:
class Thing:
    def __init__(self):
        self.x = random.random()

In [28]:
N_THINGS = 10**7
N_MATCHING = 10**2
thresh = N_MATCHING / N_THINGS


In [29]:
len(np.where(np_rands < thresh)[0])

99

In [42]:
def get_speedup(n_things=10**6, n_matching=10**1):
    # make things
    things = [Thing() for _ in range(N_THINGS)]
    np_rands = np.random.random(N_THINGS)
    
    # match them 
    t0 = time.time()
    np_result = np.where(np_rands < thresh)[0]
    t1 = time.time()
    obj_result = [t for t in things if t.x < thresh]
    t2 = time.time()
    np_time = t1-t0
    obj_time = t2-t1
    speedup = obj_time / np_time
    return np_time, obj_time, f'{speedup}x'

print(get_speedup())

(0.0064585208892822266, 0.4583597183227539, '70.9697663258149x')


In [34]:
class Thing:
    def __init__(self):
        self.x = random.random()
        self.y = random.random()

things = [Thing() for _ in range(N_THINGS)]

In [104]:
t0 = time.time()
df_pd = pd.DataFrame({'x': [t.x for t in things], 'y': [t.y for t in things], 't': things})
t1 = time.time()
print('built df in', t1-t0)
print(sys.getsizeof(df_pd) / 10**6, 'MB')
df_pd.columns

built df in 12.80781888961792
560.000144 MB


Index(['x', 'y', 't'], dtype='object')

In [95]:
t0 = time.time()
ls = df_pd[df_pd['x'] < 0.0001].t.to_list()
t1 = time.time()
print(t1-t0)
print(len(ls))

0.019664764404296875
1025


In [100]:
t0 = time.time()
df_pl = pl.DataFrame({'x': [t.x for t in things], 'y': [t.y for t in things], 't': things})
t1 = time.time()
print('built df in', t1-t0)

built df in 2.729921817779541


In [97]:
t0 = time.time()
ls = df_pl[df_pl['x'] < 0.0001].t.to_list()
t1 = time.time()
print(t1-t0)
print(len(ls))

0.07121968269348145
1025


560000144