Let's benchmark every operation in pandas to check whether it can compete with the SQLite implementation.

Object:
x: float, y: float, s: string

DataFrame Columns:
x, y, s, obj_id, obj
where obj_id is a unique index

Update: Nope, using a pandas index to look up obj_id numbers is very bad. Building an indexed df of 10k items took > 1 second.
Let's just resign ourselves to:
 - O(n) remove, maybe with a mark-and-sweep GC
 - O(n) update, since we have to scan for the obj_id.

The O(n) update is looking good: <1 ms on a 1M-item set.

In [37]:
import random
import time
import pandas as pd
import sys
from pympler.asizeof import asizeof


In [38]:
# Alphabet for random string attributes. This must be a plain string (a
# sequence of single characters): wrapped in a one-element list,
# random.choice(letters) would always return the entire 26-character string
# instead of one letter.
letters = 'qwertyuiopasdfghjklzxcvbnm'


class Thing:
    """Minimal benchmark object carrying two random float attributes.

    Attributes:
        x: value returned by random.random().
        y: value returned by random.random().
    """

    def __init__(self):
        self.x = random.random()
        self.y = random.random()
        # The string attribute is currently disabled; uncomment to give each
        # Thing a random 5-letter string drawn from `letters`.
        #self.s = ''.join(random.choice(letters) for _ in range(5))

# Size of the benchmark population and the population itself.
# Original note for 10^7 objects: 2.3G (presumably process memory — confirm).
n_things = 10_000_000
things = [Thing() for _unused in range(n_things)]

In [39]:
# Time the construction of a DataFrame with one row per Thing: its x and y
# values, a reference to the object itself, and its id() as a lookup key.
t0 = time.time()
df = pd.DataFrame(
    data={
        'x': [thing.x for thing in things],
        'y': [thing.y for thing in things],
        't': list(things),
        'obj_id': [id(thing) for thing in things],
    }
)
t1 = time.time()
build_seconds = t1 - t0
print(build_seconds)
# 10^7: 2.6G, so 300MB. Nice!

15.85633659362793


In [4]:
# Size of the frame as reported by pympler's asizeof, divided by the number
# of things, i.e. bytes per row.
bytes_per_thing = asizeof(df) / n_things
bytes_per_thing

128.0008584

In [5]:
# Disabled experiment: the code below is wrapped in a bare string literal, so
# it is not executed; the cell merely evaluates (and displays) the string.
# The wrapped code sets obj_id as the frame's index and times n_lookups
# `.loc` lookups by id.
"""
# 128 bytes / obj before
df = df.set_index('obj_id')
# 362 bytes / obj after, ouch.

n_lookups = 10**3
t0 = time.time()
for _ in range(n_lookups):
    oid = id(random.choice(things))
    df.loc[oid]
t1 = time.time()
print(t1-t0)
"""
# Result noted for the indexed lookup (presumably from a run with it enabled): sub-millisecond find by id. That's good!

"\n# 128 bytes / obj before\ndf = df.set_index('obj_id')\n# 362 bytes / obj after, ouch.\n\nn_lookups = 10**3\nt0 = time.time()\nfor _ in range(n_lookups):\n    oid = id(random.choice(things))\n    df.loc[oid]\nt1 = time.time()\nprint(t1-t0)\n"

In [6]:
# Look up rows by object id without an index: every lookup builds a boolean
# mask over the whole obj_id column.
n_runs = 1000
t0 = time.time()
for _ in range(n_runs):
    t = random.choice(things)
    wanted_id = id(t)
    df.loc[df['obj_id'] == wanted_id]
t1 = time.time()
seconds_per_lookup = (t1 - t0) / n_runs
print(seconds_per_lookup)

# One more lookup as the final expression, so the matching row is displayed.
t = random.choice(things)
df.loc[df['obj_id'] == id(t)]

# Original note: still sub-ms lookup at 1M items, very nice!
# (The recorded output for this cell, ~5.5 ms per lookup, comes from a frame
# with more than 1M rows.)

0.005534781932830811


Unnamed: 0,x,y,t,obj_id
3715099,0.806781,0.854579,<__main__.Thing object at 0x7fc1889f81c0>,140469197570496


In [7]:
# Time `df.query` range filters on x for thresholds n_finds / 10**7, with
# n_finds = 1, 10, ..., 10**7; each timing is the mean over n_runs runs and
# includes converting the matching 't' column to a list.
n_runs = 5
for exponent in range(8):
    n_finds = 10 ** exponent
    thresh = n_finds / 10**7
    t_run = 0
    for _ in range(n_runs):
        t0 = time.time()
        matched = df.query(f'x <= {thresh}')
        ls = matched['t'].to_list()
        t1 = time.time()
        # Add this run's share so t_run ends up as the mean duration.
        t_run += (t1 - t0) / n_runs
    print(n_finds, t_run)

# 3.5ms query @ 1M, 38ms @ 10M

1 0.035682296752929686
10 0.03392667770385742
100 0.03681635856628418
1000 0.035766983032226564
10000 0.04104337692260743
100000 0.06929659843444824
1000000 0.20270094871520997
10000000 0.6384607315063476


In [40]:
from rangeindex import RangeIndex
# Build a RangeIndex over `things` for the float attributes x and y, passing
# 'pandas' as the third argument (presumably the backing engine — confirm
# against the rangeindex module).
attr_types = {'x': float, 'y': float}
ri = RangeIndex(attr_types, things, 'pandas')

In [43]:
# Benchmark RangeIndex.find against a plain list-comprehension scan of
# `things` for the same predicate. Thresholds are n_finds / 10**7 for
# n_finds = 1, 10, ..., 10**7. (The stray timestamp that used to precede the
# loop was overwritten on every iteration and has been removed.)
n_runs = 5
for e in range(0, 8):
    n_finds = 10**e
    thresh = n_finds / 10**7
    t_run = 0  # mean seconds per ri.find call
    t_gen = 0  # mean seconds per list-comprehension scan
    for _ in range(n_runs):
        t0 = time.time()
        ls = ri.find('x <= {}'.format(thresh))
        t1 = time.time()
        g = [o for o in things if o.x <= thresh]
        t2 = time.time()
        # Accumulate each run's share so the totals end up as means.
        t_run += (t1 - t0) / n_runs
        t_gen += (t2 - t1) / n_runs
    # Columns: n_finds, mean find time, mean scan time, scan/find ratio.
    print(n_finds, t_run, t_gen, t_gen / t_run)

1 0.034227180480957034 0.47257580757141116 13.807032917430579
10 0.016726589202880858 0.4675947666168213 27.955177289598673
100 0.0176180362701416 0.4690868854522705 26.625378575662356
1000 0.016965293884277345 0.4678145408630371 27.574797351231627
10000 0.02004523277282715 0.4715585708618164 23.524724117998282
100000 0.04016070365905761 0.4818202018737793 11.99730477743042
1000000 0.1439764976501465 0.5277787685394287 3.6657286234444784
10000000 0.5199972152709961 0.6394063472747802 1.229634175909104


In [52]:
import random

import time
from rangeindex import RangeIndex

class Object:
    """Benchmark record: three random floats plus one random lowercase letter.

    Attributes:
        size, x, y: consecutive draws from random.random(), in that order.
        s: a single character picked with random.choice.
    """

    def __init__(self):
        # Draw order (size, x, y, then s) determines which random values each
        # attribute receives under a fixed seed.
        self.size, self.x, self.y = (random.random() for _ in range(3))
        self.s = random.choice('qwertyuiopasdfghjklzxcvbnm')

# Engine name passed to RangeIndex. It is defined here so the cell runs on a
# fresh kernel instead of relying on an `engine` variable left behind by
# another cell ('pandas' is the only engine this notebook uses).
engine = 'pandas'
objects = [Object() for _ in range(10**6)]
# Index the objects on their float attribute x.
ri = RangeIndex({'x': float}, objects, engine)


In [98]:

# Compare RangeIndex.find with a plain list-comprehension scan over `objects`
# for a series of thresholds (some are repeated), printing the match counts,
# the raw timings, and the scan/find time ratio.
for engine in ['pandas']:
    # Build the index inside the engine loop: `engine` is bound before it is
    # used, and the engine named in the printed header is the one being timed.
    ri = RangeIndex({'x': float}, objects, engine)
    for thresh in [1, 1, 1, 1, 1, 0.1, 0.001, 0.0001, 0.0001, 0.0001, 0.000001]:
        #found = ri.find(f"x > 1000")  # Doing this nonsense query speeds up all later queries by ~2X. (???)
        #found = ri.find(f"x <= {thresh}") 
        t0 = time.time()
        found = ri.find(f"x <= {thresh}")
        t1 = time.time()
        found_2 = [o for o in objects if o.x <= thresh]
        t2 = time.time()

        print('===', engine, thresh, '===')
        # Both approaches should report the same number of matches.
        print(len(found), len(found_2))
        # Scan time first, then find time.
        print(t2-t1, t1-t0)
        print('{}x speedup'.format( round((t2-t1)/(t1-t0), 3) ))



=== pandas 1 ===
1000000 1000000
0.07951235771179199 0.04459095001220703
1.783x speedup
=== pandas 1 ===
1000000 1000000
0.0876920223236084 0.043865203857421875
1.999x speedup
=== pandas 1 ===
1000000 1000000
0.08917713165283203 0.04437971115112305
2.009x speedup
=== pandas 1 ===
1000000 1000000
0.08647036552429199 0.0444331169128418
1.946x speedup
=== pandas 1 ===
1000000 1000000
0.09085822105407715 0.044862985610961914
2.025x speedup
=== pandas 0.1 ===
99502 99502
0.06303596496582031 0.019763708114624023
3.189x speedup
=== pandas 0.001 ===
963 963
0.05473589897155762 0.004719734191894531
11.597x speedup
=== pandas 0.0001 ===
110 110
0.05023837089538574 0.003261089324951172
15.405x speedup
=== pandas 0.0001 ===
110 110
0.05178213119506836 0.00337982177734375
15.321x speedup
=== pandas 0.0001 ===
110 110
0.04863905906677246 0.0034329891204833984
14.168x speedup
=== pandas 1e-06 ===
1 1
0.0482487678527832 0.003052234649658203
15.808x speedup


In [74]:

# Shorter rerun of the find-vs-scan comparison: for each threshold, time
# ri.find and the equivalent list comprehension, then print the ratio.
thresholds = (0.1, 0.01, 0.001, 0.0001)
for engine in ('pandas',):
    for thresh in thresholds:
        t0 = time.time()
        found = ri.find(f"x <= {thresh}")
        t1 = time.time()
        found_2 = [obj for obj in objects if obj.x <= thresh]
        t2 = time.time()

        find_seconds = t1 - t0
        scan_seconds = t2 - t1

        print('===', engine, thresh, '===')
        print(len(found), len(found_2))
        print(f'{scan_seconds / find_seconds}x speedup')



=== pandas 0.1 ===
99502 99502
2.0575451692255493x speedup
=== pandas 0.01 ===
9949 9949
8.8298308353759x speedup
=== pandas 0.001 ===
963 963
14.150432616675406x speedup
=== pandas 0.0001 ===
110 110
15.189874342489771x speedup


In [42]:
# Sanity check: number of entries in the index relative to the size of the
# `things` population (n_things replaces the duplicated 10**7 literal).
# NOTE(review): assumes `ri` is the index built over `things` — confirm.
len(ri) / n_things

1.0

In [21]:
# Time an update: find a random object's row by scanning obj_id, then write
# the new x value to both the frame and the object so the two stay in sync.
n_runs = 100
t0 = time.time()
for _ in range(n_runs):
    t = random.choice(things)
    matching_rows = df[df['obj_id'] == id(t)]
    idx = matching_rows.index[0]
    df.at[idx, 'x'] = 12
    t.x = 12
t1 = time.time()
seconds_per_update = (t1 - t0) / n_runs
print(seconds_per_update)

# Display the last updated row; `idx` stays bound for later cells.
df.loc[idx]

# Original note: 5ms update on one of 10M items
# (the recorded output for this cell is ~10 ms per update).

0.010246312618255616


x                                              12.0
y                                          0.717548
t         <__main__.Thing object at 0x7f421c1123d0>
obj_id                              139921915454416
Name: 8001709, dtype: object

In [22]:
# Time the in-place removal of the row labelled `idx` (bound by the update
# cell), printing the row count before and after, then the elapsed seconds.
rows_before = len(df)
print(rows_before)
t0 = time.time()
df.drop(index=idx, inplace=True)
t1 = time.time()
rows_after = len(df)
print(rows_after)
print(t1 - t0)

9999997
9999996
0.6332800388336182


In [18]:
# Elapsed seconds between the t0 and t1 timestamps currently in the kernel;
# this cell defines neither, so the value depends on which timing cell ran last.
t1-t0

0.6305458545684814