In [1]:
#!pip install -U /mystuff/wikipedia2/faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
#!pip install --upgrade faiss-gpu
!pip install faiss-cpu
#!pip install faiss-gpu-1.7.4_cuda11.4.tar.bz2


[0m

In [2]:
from faiss import write_index, read_index

#sentence_index = read_index("/mystuff/wikipedia2/wikipedia_202307.index")

In [5]:
import sqlite3
import numpy as np
from tqdm.notebook import tqdm
# Register tqdm's progress-bar helpers (e.g. `progress_apply`) on pandas objects.
# NOTE(review): no visible cell uses pandas — confirm this is still needed.
tqdm.pandas()

def read_embeddings_from_db(db_path='embedding_vectors_512_head_bge_small_en.db'):
    """Load every stored embedding from a SQLite database.

    Reads the ``id`` and ``vector`` columns of the ``embeddings`` table in
    ascending ``id`` order and decodes each ``vector`` blob as a flat
    ``float32`` array.

    Args:
        db_path: Path to the SQLite database file.

    Returns:
        A tuple ``(embedding_ids, embedding_vectors)`` of two parallel lists:
        the row ids, and one 1-D ``np.float32`` array per row. The arrays are
        created with ``np.frombuffer``, i.e. they are views over the blob
        bytes rather than copies.

    Raises:
        Exception: Any error raised while querying or decoding is printed and
            then re-raised, so a failed read cannot be mistaken for an empty
            or truncated result set.
    """
    conn = sqlite3.connect(db_path)

    embedding_ids = []
    embedding_vectors = []

    try:
        cursor = conn.cursor()

        # Fetch all stored embeddings in a stable (id) order.
        cursor.execute("SELECT id, vector FROM embeddings ORDER BY id")
        rows = cursor.fetchall()

        # Convert blobs back to NumPy arrays; decode first so that both lists
        # only grow together and always stay the same length.
        for embedding_id, embedding_blob in tqdm(rows):
            embedding_vector = np.frombuffer(embedding_blob, dtype=np.float32)
            embedding_ids.append(embedding_id)
            embedding_vectors.append(embedding_vector)

    except Exception as e:
        print(f"An error occurred while reading from the database: {e}")
        # Propagate instead of returning partial data: an index built from a
        # silently truncated read would look valid but miss documents.
        raise

    finally:
        # Always release the connection, on success and on failure.
        conn.close()

    return embedding_ids, embedding_vectors

# Read every embedding from the default SQLite database. The two lists are
# parallel: embedding_ids[i] is the row id of embedding_vectors[i].
embedding_ids, embedding_vectors = read_embeddings_from_db()


  0%|          | 0/6082515 [00:00<?, ?it/s]

In [6]:
# Stack the per-row vectors into one 2-D array (one row per embedding).
# np.stack builds a new array, so the data is briefly held twice in memory.
all_embeddings = np.stack(embedding_vectors)

In [7]:
# Sanity check: (number of embeddings, embedding dimension).
all_embeddings.shape

(6082515, 384)

## Choice of FAISS Index type

If you want to go crazy trying to understand all combinations of FAISS index parameters, I recommend digging into their docs at https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index. Or just search the internet for "faiss index selection" and similar queries.

In the end, I gave up reading about them too much and tried the Flat index, since it should give exact results. Somewhere I read that it should work fine on a few million records, which it did. Most likely that is because I used it for a Kaggle competition where I only had to run about 4k queries over those 6M records, so trading search speed for full precision was fine: there were only 4k queries and no real-time requirements.

The other trial was with the clustering version "IVF65536_HNSW32,Flat", which also seemed to give quite good results. But in practice I would now start by just trying "Flat" and go from there with some trials, rather than spend a week reading the docs and going crazy. But that's just me.

In [8]:
import faiss

# Embedding dimensionality: the number of columns of the stacked matrix.
d = all_embeddings.shape[1]
print(d)

# Alternative that was tried: an IVF (clustered) index instead of exhaustive search.
#index = faiss.index_factory(d, "IVF65536_HNSW32,Flat")
# "Flat" stores the raw vectors and searches them exhaustively (exact results).
# No metric argument is passed, so the factory's default metric applies.
index = faiss.index_factory(d, "Flat")


384


In [11]:
# Number of GPUs this FAISS build can use.
# NOTE(review): 0 is presumably expected with the CPU-only `faiss-cpu` package — confirm.
ngpus = faiss.get_num_gpus()
ngpus

0

## Train the FAISS index with embeddings to be indexed

In [15]:
%%time
#gpu_index.train(all_embeddings)
# Training matters for index types that learn a structure from the data (such
# as the IVF variant). For the Flat index this is effectively a no-op, which is
# consistent with a millisecond-scale timing on millions of vectors.
index.train(all_embeddings)


CPU times: user 83 µs, sys: 592 µs, total: 675 µs
Wall time: 5.5 ms


## Add Embeddings to FAISS Index

In [16]:
# add() appends to the index, so re-running this cell would store every vector
# a second time and make searches return duplicate hits. Only populate the
# index while it is still empty, which keeps the cell safe to re-run.
# NOTE(review): vectors are added without explicit ids, so search results are
# row positions in all_embeddings. They match the database ids only if those
# ids are 0-based and contiguous — confirm, or map hits through embedding_ids.
if index.ntotal == 0:
    index.add(all_embeddings)

## Write FAISS Index to disk

In [17]:
# Persist the populated index to disk so it can be reloaded later
# (e.g. with faiss.read_index) without re-reading the embeddings.
index_path = "faiss_index_512_flat_small.index"
faiss.write_index(index, index_path)


## Try a search where I knew which doc IDs it should return


In [18]:
from sentence_transformers import SentenceTransformer

# Local path of the query-embedding model. Presumably this is the model that
# produced the stored embeddings (the database file name mentions
# bge_small_en); query and document vectors are only comparable if it is.
# The commented-out paths are other models that were tried.
#embedding_model_path = "/mystuff/llm/gte-base"
#embedding_model_path = "/mystuff/llm/all-MiniLM-L12-v2"
embedding_model_path = "/mystuff/llm/bge-small-en"

# device=None lets SentenceTransformer pick the device itself: it uses a GPU
# when one is usable and falls back to CPU otherwise. A hard-coded 'cuda'
# fails outright on machines without CUDA.
embedding_model = SentenceTransformer(embedding_model_path, device=None)


In [19]:
# Embed the query text. The input is a one-element list, so the result is
# presumably a 2-D array with a single row — TODO confirm shape/dtype.
q_embeddings = embedding_model.encode(["What is the definition of anarchism?"])


In [23]:
# Retrieve the 5 nearest stored vectors for each query row:
# D holds the distances and I the corresponding positions in the index.
D, I = index.search(q_embeddings, 5)

In [24]:
# Index positions of the retrieved neighbours, one row per query.
I

array([[1054330, 1880579, 5809849, 2102743,       0]])

In [22]:
# Distances of the retrieved neighbours, one row per query.
# NOTE(review): this cell's execution count (In [22]) is lower than that of
# the k=5 search (In [23]), so a saved output with fewer than 5 columns comes
# from an earlier search and is stale — re-run after the search cell.
D

array([[18.58313 , 19.57069 , 20.886288]], dtype=float32)