In [5]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Cosine similarity is an extremely simple algorithm


In [6]:
%%writefile utils/cosine_dist.py

# Cosine distance is the simplest operation!

import numpy as np

def cos_dist(e1, e2):
    """Return the cosine distance ``1 - cos(e1, e2)`` between two vectors.

    Both arguments are handed straight to ``np.dot``; no zero-length check is done.
    """
    dot_product = np.dot(e1, e2)
    squared_norms = np.dot(e1, e1) * np.dot(e2, e2)
    return 1 - dot_product / np.sqrt(squared_norms)

Overwriting utils/cosine_dist.py


In [7]:
from utils.cosine_dist import cos_dist

# Run Postgres (docker container)

In [8]:
# !docker run -d  --cpus=1 --memory=2g --name pgvector-c -e POSTGRES_PASSWORD=mysecretpassword -p 2345:5432 postgres-with-pgvector

# Prepare data

In [9]:
# Dimensionality of every embedding vector (noted by the author as the OpenAI size).
EMBEDDINGS_LEN = 1536
# Total number of synthetic documents that will be generated and stored.
NUM_DOCS = 10**5

# Lower bound on storage: the factor 4 is the byte size of one float32 component.
embeddings_size_gb = 4 * NUM_DOCS * EMBEDDINGS_LEN / 10**9

print(f'Number of documents: {NUM_DOCS:,}')
print(f'More than {embeddings_size_gb: .2} GB  (size of embeddings only) will be stored in the table')


Number of documents: 100,000
More than  0.61 GB  (size of embeddings only) will be stored in the table


## Create data as several clusters (each doc = cluster center + Gaussian error)

In [10]:
import numpy as np

NUM_CLUSTERS =  10**2
# Every cluster gets the same number of documents, so the split has to be exact;
# otherwise fewer than NUM_DOCS documents would be generated without any warning.
assert NUM_DOCS % NUM_CLUSTERS == 0, 'NUM_DOCS must be a multiple of NUM_CLUSTERS'
CLUSTER_SIZE = NUM_DOCS // NUM_CLUSTERS

# Fixed seed so that re-running the notebook generates the same synthetic embeddings.
SEED = 42
rng = np.random.default_rng(SEED)

# One centre per cluster, drawn uniformly from [0, 1) as float32.
centers = rng.random((NUM_CLUSTERS, EMBEDDINGS_LEN), dtype=np.float32)
print('Num. clusters:', centers.shape[0])
# A single set of Gaussian offsets (standard deviation 1/20) shared by all clusters:
# the same offsets are added to every centre.
errors = rng.standard_normal((CLUSTER_SIZE, EMBEDDINGS_LEN), dtype=np.float32) / 20
print('Cluster size:', errors.shape[0])


Num. clusters: 100
Cluster size: 1000


In [11]:
from utils.batched import batched

# Demonstrate the helper on 95 items with a batch size of 10.
print('Utility function, example of usage:')
for b in batched(range(95), batch_size=10):
    print('next batch:', b)

Utility function, example of usage:
next batch: (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
next batch: (10, 11, 12, 13, 14, 15, 16, 17, 18, 19)
next batch: (20, 21, 22, 23, 24, 25, 26, 27, 28, 29)
next batch: (30, 31, 32, 33, 34, 35, 36, 37, 38, 39)
next batch: (40, 41, 42, 43, 44, 45, 46, 47, 48, 49)
next batch: (50, 51, 52, 53, 54, 55, 56, 57, 58, 59)
next batch: (60, 61, 62, 63, 64, 65, 66, 67, 68, 69)
next batch: (70, 71, 72, 73, 74, 75, 76, 77, 78, 79)
next batch: (80, 81, 82, 83, 84, 85, 86, 87, 88, 89)
next batch: (90, 91, 92, 93, 94)


In [12]:

def gen_data():
    """Yield one record per synthetic document, cluster by cluster.

    Each record is a dict with the keys ``id``, ``content`` and ``embedding``.
    Ids run consecutively from 0; the embedding is the cluster centre plus one
    row of the shared ``errors`` matrix.
    """
    for cluster_no in range(NUM_CLUSTERS):
        # Broadcasting adds the single centre vector to every row of ``errors``.
        cluster_embeddings = centers[cluster_no] + errors
        first_id = cluster_no * CLUSTER_SIZE

        for offset in range(CLUSTER_SIZE):
            doc_id = first_id + offset
            yield {
                'id': doc_id,
                'content': f'some unique content #{doc_id}',
                'embedding': cluster_embeddings[offset],
            }

# Baseline: how long generating all records takes on its own, with no database involved.
print('Time for iterating over all docs:')
%time for _ in gen_data(): pass

Time for iterating over all docs:
CPU times: user 56.7 ms, sys: 36.9 ms, total: 93.6 ms
Wall time: 93.5 ms


# (Re)Create table

In [13]:
from pgvector.sqlalchemy import Vector
from sqlalchemy import create_engine, insert, select, text, Integer, String, Text
from sqlalchemy.orm import declarative_base, mapped_column, Session

import os

# Connection URL of the pgvector-enabled Postgres instance. It can be overridden through the
# PGVECTOR_DB_URL environment variable so that real credentials never have to be written into
# the notebook; the fallback is the URL this notebook has always used for its local container.
DB_URL = os.environ.get(
    'PGVECTOR_DB_URL',
    'postgresql+psycopg://postgres:mysecretpassword@localhost:2345/postgres',
)
engine = create_engine(DB_URL)

# Make sure the `vector` extension exists before any table with a Vector column is created.
with engine.connect() as conn:
    conn.execute(text('CREATE EXTENSION IF NOT EXISTS vector'))
    conn.commit()

# Declarative base class for the ORM model defined below.
Base = declarative_base()


class Document(Base):
    """ORM model for the `document` table: one row per document together with its embedding."""
    __tablename__ = 'document'
    
    # Integer primary key.
    id = mapped_column(Integer, primary_key=True)
    # Text payload of the document.
    content = mapped_column(Text)
    # pgvector column with EMBEDDINGS_LEN dimensions.
    embedding = mapped_column(Vector(EMBEDDINGS_LEN))


# Start from a clean slate: drop the tables registered on Base (discarding any stored rows),
# then create them again empty.
Base.metadata.drop_all(engine)
Base.metadata.create_all(engine)


In [14]:
# One session, kept open and shared by the query cells that follow.
session = Session(engine)


In [15]:
def count_currently():
    """Print the current number of rows in the `document` table."""
    result = session.execute(text("SELECT count(*) from document"))
    print(list(result))


count_currently()

[(0,)]


In [16]:

from typing import Iterable
# conn = engine.raw_connection()

def insert_via_copy(conn, table_name, records: Iterable[dict], columns: Iterable = (), commit=True):
    """Bulk-load ``records`` into ``table_name`` with a single ``COPY ... FROM STDIN``.

    Args:
        conn: connection exposing ``cursor()``, ``commit()`` and ``rollback()``; its
            cursor must provide ``copy()``.
        table_name: target table. It is interpolated into the SQL text unescaped, so it
            must come from trusted code, never from user input.
        records: iterable of dicts, one per row. It is consumed lazily.
        columns: column names to load, in order. When empty, the keys of the first
            record are used. Interpolated unescaped, like ``table_name``.
        commit: when true, commit after a successful copy.

    Raises:
        KeyError: if a record lacks one of the requested columns.
        Any other exception raised while copying or committing is re-raised after the
        transaction has been rolled back.
    """
    from itertools import chain

    column_names = list(columns)
    rows = iter(records)
    has_rows = True
    if not column_names:
        # Peek at the first record only, so the remaining records are still streamed
        # instead of being materialised in memory.
        first = next(rows, None)
        if first is None:
            # Nothing to insert and no record to derive the column list from.
            has_rows = False
        else:
            column_names = list(first.keys())
            rows = chain([first], rows)

    try:
        if has_rows:
            with conn.cursor() as cursor:
                with cursor.copy(f"COPY {table_name} ({', '.join(column_names)}) FROM STDIN") as copy:
                    for record in rows:
                        # Pick values by column name: relying on dict order would put values
                        # into the wrong columns whenever it differs from `columns`.
                        copy.write_row(tuple(record[name] for name in column_names))

        if commit:
            conn.commit()

    except BaseException:
        # Undo the partial copy so the connection stays usable, then surface the error.
        conn.rollback()
        raise


# Write docs to DB

In [17]:
%%time
from tqdm import tqdm
from itertools import islice
from utils.numpy_to_json import FastJSONEncoder
import json 

batch_size = 100

# Set to a small number (e.g. 1_000) for a quick trial run; None writes every document.
limit = None

doc_list = islice(gen_data(), limit)

conn = engine.raw_connection()

progress = tqdm(doc_list, total=limit or NUM_DOCS, desc='Write docs to db')
for batch in batched(progress, batch_size):

    # Replace each embedding by its JSON text before it is handed to COPY.
    # NOTE(review): presumably FastJSONEncoder renders the numpy array as a JSON list —
    # confirm in utils/numpy_to_json.
    for doc in batch:
        doc['embedding'] = json.dumps(doc['embedding'], cls=FastJSONEncoder)

    # commit=False: nothing is committed inside the loop.
    insert_via_copy(
        conn,
        Document.__tablename__,
        records=batch,
        columns=('id', 'content', 'embedding'),
        commit=False,
    )



    

Write docs to db: 100%|██████████| 100000/100000 [00:28<00:00, 3519.30it/s]


CPU times: user 6.02 s, sys: 644 ms, total: 6.66 s
Wall time: 28.4 s


In [18]:
# Make the rows written through the raw connection durable.
conn.commit()
# No later cell uses the raw connection; close() hands it back to the engine's pool instead of
# leaving it checked out for the rest of the kernel's lifetime.
conn.close()

In [14]:
# Sanity check: print how many rows the table holds after the bulk load.
count_currently()

[(100000,)]


# Search 

## Baseline

In [15]:
# Baseline communication time  - retrieve by IDs
import random

x = random.randint(0, centers.shape[0])
print(x)

# session.scalars(select(Document).order_by(Document.embedding.cosine_distance(doc_X_embeddings)).limit(10))
def retrieve_by_ids():
    return session.query(Document).filter(Document.id.in_(range(x, x+10))).all()
%timeit retrieve_by_ids()
docs_found = retrieve_by_ids()
print([
    d.id
    for d in docs_found
])

87
3.05 ms ± 181 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
[87, 88, 89, 90, 91, 92, 93, 94, 95, 96]


## Search by cosine dist. (without index)

In [19]:
# Drop my_index if it exists, so the searches in the next cell run without it.
drop_result = session.execute(text('DROP INDEX IF EXISTS my_index'))
print(drop_result)
session.commit()

<sqlalchemy.engine.cursor.CursorResult object at 0x137fce5f0>


In [31]:
import random
from tabulate import tabulate
# from utils.cosine_dist import cos_dist


def search(doc_X_embeddings, limit=10):
    """Query the ``limit`` documents closest to ``doc_X_embeddings`` by cosine distance.

    The ordering is expressed in the SQL statement; the value returned is whatever
    ``session.scalars`` gives back for that statement.
    """
    distance = Document.embedding.cosine_distance(doc_X_embeddings)
    nearest_first = select(Document).order_by(distance).limit(limit)
    return session.scalars(nearest_first)


for _ in range(3):
    x = random.randint(0, centers.shape[0]-1)
    doc_X_embeddings = centers[x]
        
    %timeit -n 3 search(doc_X_embeddings)

    docs_found = search(doc_X_embeddings)
    print(f'For center #{x}, the closest docs found:')
    print(tabulate([
        {'ID': d.id, 'Dist.': cos_dist(d.embedding, doc_X_embeddings)}
        for d in docs_found
    ], headers='keys'))


7.19 ms ± 4.6 ms per loop (mean ± std. dev. of 7 runs, 3 loops each)
For center #57, the closest docs found:
   ID       Dist.
-----  ----------
57936  0.00327247
57210  0.00336128
57484  0.00336158
57508  0.00336242
57780  0.00336695
57803  0.00337636
57651  0.00337857
57171  0.00338113
57855  0.00338179
57847  0.00338322
3.87 ms ± 246 µs per loop (mean ± std. dev. of 7 runs, 3 loops each)
For center #87, the closest docs found:
   ID       Dist.
-----  ----------
87936  0.0032472
87484  0.00333744
87210  0.00333887
87780  0.00335813
87508  0.00336373
87171  0.00336623
87847  0.00336719
87855  0.00336879
87882  0.00337005
87576  0.00338191
3.45 ms ± 340 µs per loop (mean ± std. dev. of 7 runs, 3 loops each)
For center #99, the closest docs found:
   ID       Dist.
-----  ----------
99936  0.0032419
99171  0.00335532
99847  0.00335866
99855  0.00336528
99671  0.0033769
99591  0.00337923
99087  0.0033856
99949  0.00338745
99163  0.00338954
99205  0.00340569


## Create index

In [33]:
%%time

from sqlalchemy import Index

def recreate_index(expected_number_of_clusters=10):
    """Drop ``my_index`` if present and build it again as an ivfflat cosine index.

    ``expected_number_of_clusters`` is passed to the index as its ``lists`` option.
    """
    session.execute(text('DROP INDEX IF EXISTS my_index'))
    session.commit()

    ivfflat_options = {
        'postgresql_using': 'ivfflat',
        'postgresql_with': {'lists': expected_number_of_clusters},
        'postgresql_ops': {'embedding': 'vector_cosine_ops'},
    }
    Index('my_index', Document.embedding, **ivfflat_options).create(engine)


recreate_index(expected_number_of_clusters=100)

CPU times: user 2.89 ms, sys: 2.76 ms, total: 5.65 ms
Wall time: 9.03 s


## Search with index

In [34]:
from tabulate import tabulate
import random

for _ in range(3):
    x = random.randint(0, centers.shape[0]-1)
    doc_X_embeddings = centers[x]
    %timeit search(doc_X_embeddings)
    docs_found = search(doc_X_embeddings)
    print(f'For a center #{x}, the closest docs found:')
    print(tabulate([
        {'ID': d.id, 'Dist.': cos_dist(d.embedding, doc_X_embeddings)}
        for d in docs_found
    ], headers='keys'))

3.04 ms ± 153 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
For a center #98, the closest docs found:
   ID       Dist.
-----  ----------
98780  0.0034411
98814  0.00344491
98882  0.00344986
98847  0.00345266
98163  0.00346398
98499  0.00347525
98507  0.00348258
98305  0.00349295
98385  0.00350589
98174  0.0035128
4.05 ms ± 355 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
For a center #54, the closest docs found:
   ID       Dist.
-----  ----------
54936  0.00330442
54210  0.00338912
54484  0.00340271
54508  0.00340605
54882  0.00341082
54171  0.00341249
54780  0.00341249
54803  0.00342011
54847  0.00342065
54314  0.0034312
110 ms ± 1.74 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
For a center #22, the closest docs found:
   ID       Dist.
-----  ----------
22936  0.00334799
22210  0.00343323
22484  0.00343531
22780  0.00344211
22171  0.00344431
22847  0.00344867
22508  0.00345004
22576  0.0034557
22803  0.00345761
22882  0.00345981


## Create index with wrong number of expected clusters

In [35]:
%%time

# 10 lists is far fewer than the 100 real clusters, so each list presumably holds many more
# vectors to scan and search becomes much slower (see the timings in the next cell).
# NOTE(review): the earlier wording also said the recall rate would decrease; with fewer, larger
# lists that is not the expected effect — confirm against the pgvector IVFFlat documentation.
recreate_index(expected_number_of_clusters=10)  

CPU times: user 2.32 ms, sys: 2.27 ms, total: 4.59 ms
Wall time: 5.51 s


## Search with non-optimal index

In [38]:
from tabulate import tabulate
import random

for _ in range(3):
    x = random.randint(0, centers.shape[0]-1)
    doc_X_embeddings = centers[x]
    %timeit search(doc_X_embeddings)
    docs_found = search(doc_X_embeddings)
    print(f'For a center #{x}, the closest docs found:')
    print(tabulate([
        {'ID': d.id, 'Dist.': cos_dist(d.embedding, doc_X_embeddings)}
        for d in docs_found
    ], headers='keys'))

181 ms ± 2.3 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
For a center #52, the closest docs found:
   ID       Dist.
-----  ----------
52936  0.00331742
52210  0.00339168
52780  0.00339967
52484  0.00340068
52508  0.00341493
52847  0.00341809
52803  0.00342214
52814  0.00342488
52576  0.0034284
52507  0.00343168
181 ms ± 2.56 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
For a center #72, the closest docs found:
   ID       Dist.
-----  ----------
72936  0.00323367
72803  0.00331903
72484  0.00332761
72210  0.00332987
72780  0.00333238
72847  0.00333828
72508  0.00334734
72576  0.00335163
72882  0.00335211
72814  0.00335699
181 ms ± 2.97 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
For a center #53, the closest docs found:
   ID       Dist.
-----  ----------
53936  0.00326687
53210  0.00336123
53484  0.0033676
53780  0.00337124
53508  0.0033744
53882  0.00338584
53847  0.00338614
53576  0.00338781
53171  0.00338817
53803  0.00339353
