In [60]:
# Enable IPython's autoreload extension; mode 2 re-imports all loaded modules before
# each cell executes, so edits to local helper modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [68]:
# Cosine distance is the simplest operation!

%%writefile utils/cosine_dist.py
import numpy as np

def cos_dist(e1, e2):
    """Return the cosine distance between two vectors.

    Args:
        e1: First vector (anything ``np.dot`` accepts, e.g. a list or array).
        e2: Second vector, same length as ``e1``.

    Returns:
        ``1 - (e1 . e2) / sqrt((e1 . e1) * (e2 . e2))``.
    """
    cross = np.dot(e1, e2)
    squared_norm_product = np.dot(e1, e1) * np.dot(e2, e2)
    return 1 - cross / np.sqrt(squared_norm_product)

UsageError: Line magic function `%%writefile` not found.


In [None]:
from utils.cosine_dist import cos_dist

In [1]:
# Dimensions of the synthetic dataset used throughout the notebook.
EMBEDDINGS_LEN = 1536  # OpenAI embedding size
NUM_DOCS = 200_000     # number of documents to generate and insert

# Rough lower bound on the table size; the factor 4 is presumably the number of
# bytes per vector component — TODO confirm.
print(f'More than {4*NUM_DOCS*EMBEDDINGS_LEN / 10**6} MB will be stored in a table')


More than 1228.8 MB will be stored in a table


In [2]:
import os

from pgvector.sqlalchemy import Vector
from sqlalchemy import create_engine, insert, select, text, Integer, String, Text
from sqlalchemy.orm import declarative_base, mapped_column, Session

# The connection URL is read from the DATABASE_URL environment variable so that real
# credentials stay out of the notebook; the fallback is the original local dev instance.
DATABASE_URL = os.environ.get(
    'DATABASE_URL',
    'postgresql+psycopg://postgres:mysecretpassword@localhost:2345/postgres',
)
engine = create_engine(DATABASE_URL)

# Install the pgvector extension if it is missing. engine.begin() commits when the
# block exits normally and rolls back if it raises.
with engine.begin() as conn:
    conn.execute(text('CREATE EXTENSION IF NOT EXISTS vector'))

# Declarative base whose metadata is used further down to drop/create the tables.
Base = declarative_base()


class Document(Base):
    """ORM model for the ``document`` table: a text document plus its embedding vector."""
    __tablename__ = 'document'
    
    # Primary key; the insert loop later in the notebook assigns ids explicitly.
    id = mapped_column(Integer, primary_key=True)
    content = mapped_column(Text)
    # pgvector column declared with EMBEDDINGS_LEN dimensions.
    embedding = mapped_column(Vector(EMBEDDINGS_LEN))


# WARNING: drop_all() removes the existing `document` table on every run of this cell,
# so the notebook always starts from an empty table.
Base.metadata.drop_all(engine)
Base.metadata.create_all(engine)
# Session shared by all following cells.
session = Session(engine)


In [3]:
# Stored row-count estimate (pg_class.reltuples) for the freshly created table.
estimate_query = text("SELECT reltuples AS estimate FROM pg_class WHERE relname = 'document'")
list(session.execute(estimate_query))

[(-1.0,)]

In [4]:
# Exact row count of the table, for comparison with the estimate.
count_query = text("SELECT count(*) from document")
list(session.execute(count_query))

[(0,)]

In [5]:
# Documents are inserted in batches of this many rows.
BATCH_SIZE = 1_000


In [6]:
import numpy as np

# Seed NumPy's global RNG so the synthetic data is identical on every
# Restart & Run All (later draws from np.random in this notebook follow the same stream).
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# One random cluster center per insert batch. Guard against a silent floor division
# that would generate fewer than NUM_DOCS documents.
assert NUM_DOCS % BATCH_SIZE == 0, 'NUM_DOCS must be a multiple of BATCH_SIZE'
centers = np.random.rand(NUM_DOCS // BATCH_SIZE, EMBEDDINGS_LEN)
centers.shape

(200, 1536)

In [9]:
# Gaussian noise, one row per document of a batch, scaled down by a factor of 20.
# The insert loop adds this same matrix to every center.
error = np.random.randn(BATCH_SIZE, EMBEDDINGS_LEN)
error /= 20
error.shape


(1000, 1536)

In [None]:
# from numba import njit
# import numpy as np
# import random

# @njit(parallel=True)
# def gen_random():
#     return np.random.rand(num_docs, EMBEDDINGS_LEN)

# generated_embeddings = gen_random()
# generated_embeddings.shape

In [7]:
# Roll back the session's current transaction before inserting — presumably a safeguard
# for re-running after a failed or interrupted insert; confirm it is still needed.
session.rollback()

# Write docs to DB

In [10]:
from tqdm.notebook import trange, tqdm


# Insert the synthetic documents, one batch per center.
for i in trange(centers.shape[0]):
    # Batch i is center i plus the shared noise matrix (BATCH_SIZE x EMBEDDINGS_LEN),
    # converted to nested Python lists.
    embeddings = (centers[i] + error).tolist()
    first_id = i * BATCH_SIZE

    # Ids run consecutively across batches. The content text is identical for every
    # row (the original literal had no placeholder), so a plain string is used.
    rows = [
        dict(id=doc_id, content='some unique content', embedding=embedding)
        for doc_id, embedding in zip(range(first_id, first_id + BATCH_SIZE), embeddings)
    ]
    session.bulk_insert_mappings(Document, rows)
    session.flush()

# Single commit once every batch has been sent.
session.commit()

  0%|          | 0/200 [00:00<?, ?it/s]

In [None]:
# Commits the session again; the insert cell already ends with a commit, so this is
# expected to have nothing left to write.
session.commit()

In [11]:
# Stored row-count estimate (pg_class.reltuples) after the bulk insert.
estimate_query = text("SELECT reltuples AS estimate FROM pg_class WHERE relname = 'document'")
list(session.execute(estimate_query))

[(200000.0,)]

In [12]:
# Exact row count after the bulk insert.
count_query = text("SELECT count(*) from document")
list(session.execute(count_query))

[(200000,)]

# Search 

## Baseline

In [21]:
# Baseline communication time  - retrieve by IDs

x = random.randint(0, centers.shape[0])
print(x)

# session.scalars(select(Document).order_by(Document.embedding.cosine_distance(doc_X_embeddings)).limit(10))
def retrieve_by_ids():
    return session.query(Document).filter(Document.id.in_(range(x, x+10))).all()
%timeit retrieve_by_ids()
docs_found = retrieve_by_ids()
print([
    d.id
    for d in docs_found
])

192
9.58 ms ± 197 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
[192, 193, 194, 195, 196, 197, 198, 199, 200, 201]


## Search by cosine dist. (without index)

In [56]:
# Drop the vector index if a previous run left one behind, so the next search runs without it.
session.execute(text('DROP INDEX IF EXISTS my_index'))
session.commit()

In [69]:
import random
# from utils.cosine_dist import cos_dist

x = random.randint(0, centers.shape[0])
doc_X_embeddings = centers[x]

def search(doc_X_embeddings):
    return session.scalars(select(Document).order_by(Document.embedding.cosine_distance(doc_X_embeddings)).limit(10))

%timeit search(doc_X_embeddings)

docs_found = search(doc_X_embeddings)
print(f'For center #{x}, the closest docs found:')
print(tabulate([
    {'ID': d.id, 'Dist.': cos_dist(d.embedding, doc_X_embeddings)}
    for d in docs_found
], headers='keys'))


1.12 s ± 124 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
For center #58, the closest docs found:
   ID     Dist.
-----  --------
58516  0.996619
58877  0.996554
58825  0.996549
58206  0.996523
58686  0.996501
58852  0.996488
58968  0.996488
58887  0.996481
58791  0.996472
58699  0.996469


## Create index

In [24]:
%%time
from sqlalchemy import Index
# IVFFlat index on the embedding column with the cosine operator class, matching the
# cosine_distance ordering used by search().
# NOTE(review): lists=100 is a tuning parameter — confirm it suits the NUM_DOCS rows in the table.
index = Index('my_index', Document.embedding,
    postgresql_using='ivfflat',
    postgresql_with={'lists': 100},
    postgresql_ops={'embedding': 'vector_cosine_ops'}
)
# Emits the CREATE INDEX statement against the already populated table.
index.create(engine)

## Search with index

In [59]:
%timeit search(doc_X_embeddings)
from tabulate import tabulate


docs_found = search(doc_X_embeddings)
print(f'For center #{x}, the closest docs found:')
print(tabulate([
    {'ID': d.id, 'Dist.': cos_dist(d.embedding, doc_X_embeddings)}
    for d in docs_found
], headers='keys'))

1.04 s ± 14.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
For center #181, the closest docs found:
    ID       Dist.
------  ----------
181516  0.0019814
181825  0.00197517
181206  0.00197166
181877  0.00197745
181686  0.00197224
181125  0.00197476
181968  0.00196963
181887  0.00197558
181561  0.0019677
181699  0.00197679
