In [2]:
import random
import time
import duckdb
import pandas as pd
import sys
from pympler.asizeof import asizeof


In [3]:
# Pool of single lowercase characters for the optional random-string attribute.
# list(...) splits the string into 26 one-character entries. Wrapping the string
# in [...] instead produces a one-element list, so random.choice(letters) would
# always return the entire alphabet string rather than a single letter.
letters = list('qwertyuiopasdfghjklzxcvbnm')
class Thing:
    """Small payload object carrying two random float attributes, x and y."""

    def __init__(self):
        # random.random() draws a float from [0.0, 1.0)
        self.x = random.random()
        self.y = random.random()
        # Optional 5-letter string attribute, currently disabled.
        #self.s = ''.join(random.choice(letters) for _ in range(5))

# Size of the benchmark population.
n_things = 10 ** 7
# Materialize every Thing up front and keep them all alive in one list.
things = [Thing() for _idx in range(n_things)]

# Memory note for 10^7 things: 2.3G

In [4]:
# connect to an in-memory database
con = duckdb.connect()

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() is a wall clock and can jump if it is adjusted.
t0 = time.perf_counter()
# build a pandas df and insert that; this is ~10x faster than using executemany() to insert a long list of values.
df = pd.DataFrame({
    'x': [t.x for t in things],
    'y': [t.y for t in things],
    # id(t) is stored as the key used to map result rows back to Python objects
    'obj_id': [id(t) for t in things],
    }
)

# convert the df into a DuckDB table (the query refers to the DataFrame by its variable name)
con.execute('DROP TABLE IF EXISTS my_table')
con.execute("CREATE TABLE my_table AS SELECT * FROM df")
del df
# id -> object map, used to turn obj_id values from query results back into Things
obj_lookup = {id(t): t for t in things}
t1 = time.perf_counter()
# Elapsed seconds for building the table plus the lookup dict
print(t1-t0)
# Memory notes (which step each figure follows is not recorded here):
# After: 2.8GB, so 500MB used
# 3.5GB, so another 600MB there.
# Roughly 1.2GB spent on index total.

19.40508484840393


In [6]:
# Sanity check: fetch a handful of stored object ids and show them.
res = con.execute('SELECT obj_id FROM my_table LIMIT 10').fetchall()
print(res)

[(139947766431456,), (139946298726480,), (139946298726336,), (139946298724848,), (139946298724704,), (139946298724416,), (139946298726816,), (139946298726912,), (139946298725568,), (139946298725664,)]


In [7]:

# Benchmark: time DuckDB filter queries at increasing thresholds on y, including
# the cost of mapping the returned obj_ids back to Python objects via obj_lookup.
n_runs = 10
for e in range(0, 8):
    # Threshold on y: 10^-7, 10^-6, ..., 1
    thresh = 10**e / 10**7
    # Accumulate raw totals and divide once after the inner loop. Adding
    # value / n_runs on every pass accumulates float rounding error (ten
    # additions of 0.1 give 0.9999999999999999), which int() would then
    # truncate to one less than the true row count.
    t_duck = 0
    t_ls = 0
    avg_len = 0
    for _ in range(n_runs):
        # perf_counter() is a monotonic, high-resolution clock meant for intervals
        t0 = time.perf_counter()
        con.execute("SELECT obj_id FROM my_table where x <=1 and y <= {}".format(thresh))
        res = con.fetchall()
        objs = [obj_lookup[r[0]] for r in res]
        #objs = [ctypes.cast(r[0], ctypes.py_object).value for r in res]
        t1 = time.perf_counter()
        # The list-scan baseline below is disabled, so t_ls currently measures
        # only the gap between two consecutive timer reads.
        #objs_ls = [t for t in things if t.x<=1 and t.y <= thresh]
        t2 = time.perf_counter()
        t_duck += t1 - t0
        t_ls += t2 - t1
        avg_len += len(objs)
    # Convert totals to per-run averages
    t_duck /= n_runs
    t_ls /= n_runs
    avg_len /= n_runs
    print(int(avg_len), t_duck)

1 0.002308559417724609
9 0.0035325527191162115
110 0.02782580852508545
1005 0.09927351474761963
9983 0.1311617136001587
100131 0.20639636516571044
1001995 0.8211596727371214
10000000 4.665687894821167


[<__main__.Thing at 0x7f1c9f649100>]