In [3]:
import random
import time
import pandas as pd
import duckdb
import sys
from pympler.asizeof import asizeof


In [4]:
# NOTE(review): this is a one-element list holding a single 26-character string, not a
# list of individual letters; it is not referenced in the cells shown here.
letters = ['qwertyuiopasdfghjklzxcvbnm']
class Thing:
    """Synthetic record with ten float attributes, c0 through c9.

    Each attribute is filled with an independent draw from random.random(),
    taken in attribute order (c0 first, c9 last).
    """

    def __init__(self):
        # Assign in index order so the sequence of RNG draws maps to c0..c9.
        for idx in range(10):
            setattr(self, f"c{idx}", random.random())

n_things = 10**7
# Seed the RNG so the generated objects (and therefore the match counts reported by the
# benchmark cells) are the same on every Restart & Run All.
random.seed(0)
things = [Thing() for _ in range(n_things)]

# Memory note for n_things = 10^7: 5.1G

In [5]:
# Open an in-memory DuckDB database (':memory:'), so nothing is written to a database file.
con = duckdb.connect(":memory:")

In [12]:
# Build an id() -> object map so the integer obj_id values returned by a query can be
# turned back into the original Thing instances. The ids stay valid because `things`
# keeps every object alive.
# perf_counter() is a monotonic, high-resolution clock intended for measuring intervals;
# time.time() is wall-clock and can jump if the system clock is adjusted.
t0 = time.perf_counter()
obj_lookup = {id(t): t for t in things}
t1 = time.perf_counter()
print('build dict', t1-t0)
# Memory note: 5.7GB at this point (up from 5.1G), so the dict costs about 600MB.

build dict 3.1430535316467285


In [7]:

# perf_counter() is the appropriate clock for interval timing (monotonic, high resolution).
t0 = time.perf_counter()
# build a pandas df and insert that; this is ~10x faster than using executemany() to insert a long list of values.
# Columns are c0..c9 (one per Thing attribute) followed by obj_id = id(t), which links
# each row back to its Python object. The column lists are built inline so that no
# lingering reference to the temporary lists survives this statement.
# NOTE: the variable must be called `df` -- the CREATE TABLE statement below references
# it by that name.
df = pd.DataFrame({
    **{name: [getattr(t, name) for t in things]
       for name in (f'c{i}' for i in range(10))},
    'obj_id': [id(t) for t in things],
})

# convert the df into a DuckDB db
con.execute('DROP TABLE IF EXISTS my_table')
con.execute("CREATE TABLE my_table AS SELECT * FROM df")
t1 = time.perf_counter()
# The table now holds its own copy of the data; release the DataFrame.
del df
print(t1-t0)
# After: 7.6GB, so another 1.9GB used, total 2.5GB used by index

34.5368390083313


In [11]:

# Benchmark: for each threshold, select the rows where every column c0..c9 is <= thresh,
# resolve the returned obj_id values back to Python objects, and report
# (threshold, average number of matches, average seconds per run) over n_runs runs.
n_runs = 10
filter_columns = [f"c{i}" for i in range(10)]
for thresh in [0.15, 0.25, 0.33, 0.4, 0.51, 0.64, 0.8, 1.0]:
    # The query text depends only on thresh, so build it once per threshold rather
    # than inside the timed region of every run.
    where_clause = " and ".join(f"{col} <= {thresh}" for col in filter_columns)
    query = f"SELECT obj_id FROM my_table where {where_clause}"

    # Accumulate raw totals and divide once at the end. Adding len/n_runs on every
    # run accumulates float rounding error (ten additions of 0.1 give
    # 0.9999999999999999), which int() would then truncate to the wrong count.
    total_seconds = 0.0
    total_len = 0
    for _ in range(n_runs):
        # Timed region: query execution, fetching rows, and id -> object lookup.
        t0 = time.perf_counter()
        con.execute(query)
        res = con.fetchall()
        objs = [obj_lookup[r[0]] for r in res]
        t1 = time.perf_counter()
        total_seconds += t1 - t0
        total_len += len(objs)
    # A pure-Python list-comprehension baseline over `things` was previously sketched
    # here but was disabled (and its timer measured nothing); add it back under its
    # own timer if a comparison is needed.
    t_duck = total_seconds / n_runs
    avg_len = total_len / n_runs
    print(thresh, int(avg_len), t_duck)

0.15 0 0.22828056812286376
0.25 10 0.24684977531433105
0.33 154 0.2692765474319458
0.4 1112 0.29678959846496583
0.51 11877 0.3806330680847168
0.64 115226 0.526672887802124
0.8 1074704 1.1882512807846068
1.0 10000000 4.489729809761047


In [34]:
# Ten-term partial sum of the geometric series 0.9 + 0.09 + 0.009 + ...
sum(0.9*10**-e for e in range(10))

0.9999999999