This notebook assumes a database of document chunks already exists. From those chunks it loads the first chunk of each document and creates an embedding for that chunk. Later, I used a separate notebook to index all of these embeddings into a vector database (FAISS in this case). Keeping the indexing in a separate notebook, with a separate database, made it easier to experiment with creating the embeddings, storing them in a general form, and then trying them with different vector database implementations.

In [1]:
from sentence_transformers import SentenceTransformer

# Local directory of the sentence-transformers model to load. The commented-out
# assignments are alternative models that were tried; only one should be active.
#embedding_model_path = "/mystuff/llm/gte-base"
#embedding_model_path = "/mystuff/llm/all-MiniLM-L12-v2"
#embedding_model_path = "/mystuff/llm/bge-small-en"
embedding_model_path = "/mystuff/llm/bge-base-en"

# Load the model with device='cuda'; embed_chunks() reads this notebook-level object.
embedding_model = SentenceTransformer(embedding_model_path, device='cuda')


Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`,  it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again. You will see the new `added_tokens_decoder` attribute that will store the relevant information.


In [2]:
def embed_chunks(chunks_to_embed):
    """Encode a batch of texts with the notebook-level ``embedding_model``.

    Args:
        chunks_to_embed: The texts to embed; passed unchanged to
            ``embedding_model.encode``.

    Returns:
        Whatever ``embedding_model.encode`` returns for the batch.
    """
    return embedding_model.encode(chunks_to_embed)

In [3]:
import sqlite3
from tqdm.notebook import tqdm
# Registers tqdm's progress hooks with pandas.
# NOTE(review): no pandas call is visible in this notebook, so this looks unused — confirm before removing.
tqdm.pandas()

def fetch_documents(database_path):
    """Yield every document in the chunk database as a nested dictionary.

    Documents are produced in ascending ``document_id`` order, one query per
    document, so only a single document is held in memory at a time.

    Args:
        database_path: Path of the SQLite database containing the
            ``documents``, ``sections`` and ``text_chunks`` tables.

    Yields:
        dict: ``{'id': document_id, 'title': document_title, 'sections': {...}}``
        where ``sections`` maps ``section_id`` to
        ``{'title': section_title, 'chunks': {chunk_id: content}}``.
        A document without sections is yielded with an empty ``sections`` dict.

    The connection is closed when the generator is exhausted *and* when it is
    abandoned early (e.g. ``next(gen)`` followed by ``del gen``), because the
    cleanup lives in a ``finally`` block.
    """
    connection = sqlite3.connect(database_path)
    try:
        cursor = connection.cursor()

        cursor.execute("SELECT DISTINCT document_id FROM documents ORDER BY document_id;")
        # Materialise the ids up front: the same cursor is reused for the
        # per-document query below, which would discard this result set.
        document_ids = [row[0] for row in cursor.fetchall()]

        for doc_id in document_ids:
            cursor.execute("""
                SELECT
                    d.document_id, d.document_title,
                    s.section_id, s.section_title,
                    tc.chunk_id, tc.content
                FROM documents d
                LEFT JOIN sections s ON s.document_id = d.document_id
                LEFT JOIN text_chunks tc ON tc.document_id = d.document_id AND tc.section_id = s.section_id
                WHERE d.document_id = ?
                ORDER BY d.document_id, s.section_id, tc.chunk_id;
            """, (doc_id,))

            document = {
                'id': doc_id,
                'title': None,
                'sections': {}
            }

            for _, document_title, section_id, section_title, chunk_id, content in cursor:
                document['title'] = document_title

                # The LEFT JOIN yields NULL section columns for a document that
                # has no sections; there is nothing to record for such a row.
                if section_id is None:
                    continue

                # Compare against None rather than truthiness so that an id of 0
                # is treated as a real section/chunk instead of being dropped.
                section = document['sections'].setdefault(
                    section_id, {'title': section_title, 'chunks': {}}
                )
                if chunk_id is not None:
                    section['chunks'][chunk_id] = content

            yield document  # Hand the finished document to the caller
    finally:
        # Runs on exhaustion, on an exception, and on generator close/GC.
        connection.close()

In [4]:
#embeddings.keys()

In [5]:
import sqlite3

def get_last_inserted_id(db_path='embedding_vectors_256_head.db'):
    """Return the largest ``id`` stored in the ``embeddings`` table.

    Args:
        db_path: Path of the SQLite database that holds the ``embeddings`` table.

    Returns:
        The value of ``MAX(id)``; this is ``None`` when the table has no rows.
        If the query fails for any reason (for example the table does not
        exist) the error is printed and ``0`` is returned instead.
    """
    db = sqlite3.connect(db_path)
    try:
        row = db.cursor().execute("SELECT MAX(id) FROM embeddings").fetchone()
        highest_id = row[0]
    except Exception as e:
        print(f"An error occurred while reading from the database: {e}")
        return 0
    finally:
        # Always release the connection, whichever path is taken above.
        db.close()
    return highest_id

# Resume point: the highest id already stored in the embeddings table, or 0 when
# nothing has been stored (or the table could not be read).
# NOTE(review): this reads the function's default file 'embedding_vectors_256_head.db',
# while the write loop in this notebook opens "embedding_vectors_256_head_bge_base.db"
# — confirm which database the resume point should come from.
last_inserted_id = get_last_inserted_id() or 0
print(f"The last inserted ID is: {last_inserted_id}")


An error occurred while reading from the database: no such table: embeddings
The last inserted ID is: 0


In [6]:
import sqlite3

def fetch_document_id_by_chunk_id(database_path, chunk_id):
    """Look up the document that owns a given text chunk.

    Args:
        database_path: Path of the SQLite database containing the
            ``documents``, ``sections`` and ``text_chunks`` tables.
        chunk_id: The ``text_chunks.chunk_id`` to look up.

    Returns:
        The ``document_id`` of the first matching row, or ``0`` when no row
        matches. The outcome is also printed.

    The connection is closed in a ``finally`` block, so it is released on
    every path (previously the close call sat after both ``return``
    statements and could never run).
    """
    # NOTE(review): sections is joined on section_id only, whereas
    # fetch_documents also ties sections to the document — confirm section_id
    # is unique across documents, otherwise this join matches many rows.
    query = """
    SELECT d.document_id
    FROM text_chunks tc
    JOIN sections s ON tc.section_id = s.section_id
    JOIN documents d ON tc.document_id = d.document_id
    WHERE tc.chunk_id = ?;
    """

    connection = sqlite3.connect(database_path)
    try:
        row = connection.cursor().execute(query, (chunk_id,)).fetchone()
    finally:
        connection.close()

    if row:
        document_id = row[0]
        print(f"Document ID corresponding to Chunk ID {chunk_id}: {document_id}")
        return document_id

    print(f"No chunk found with the given Chunk ID {chunk_id}.")
    return 0

# Translate the resume id into the id of the document that owns that chunk.
# NOTE(review): the write loop in this notebook stores doc["id"] in embeddings.id,
# so last_inserted_id may already be a document id rather than a chunk id — confirm
# that this lookup is still the intended way to find the resume document.
database_path = "wikipedia_chunks_256.db"
chunk_id_to_query = last_inserted_id
last_inserted_doc_id = fetch_document_id_by_chunk_id(
    database_path=database_path,
    chunk_id=chunk_id_to_query,
)


No chunk found with the given Chunk ID 0.


In [7]:
# Display the resume id (0 when get_last_inserted_id() found nothing or failed).
last_inserted_id

0

In [8]:
# Display the resume document id (0 when no chunk matched the resume id).
last_inserted_doc_id

0

In [10]:
# Pull only the first document out of the chunk database for inspection; the
# generator is not kept around afterwards.
doc = next(fetch_documents(database_path))

In [11]:
# List the section ids present in the sample document.
print(doc["sections"].keys())

dict_keys([1, 2, 3, 4, 5, 6, 7])


In [14]:
# First section id in the dict's insertion order.
first_section = list(doc["sections"])[0]

In [15]:
# Inspect the first section: its title and its chunk_id -> content mapping.
doc["sections"][first_section]

{'title': 'Introduction',
 'chunks': {1: "Introduction - '''Anarchism''' is a political philosophy and movement that is skeptical of all justifications for authority and seeks to abolish the institutions it claims maintain unnecessary coercion and hierarchy, typically including nation states, and capitalism",
  2: '. Anarchism advocates for the replacement of the state with stateless societies and voluntary free associations',
  3: '. As a historically left-wing movement, this reading of anarchism is placed on the farthest left of the political spectrum, usually described as the libertarian wing of the socialist movement (libertarian socialism).',
  4: 'Humans have lived in societies without formal hierarchies long before the establishment of states, realms, or empires. With the rise of organised hierarchical bodies, scepticism toward authority also rose',
  5: '. Although traces of anarchist ideas are found all throughout history, modern anarchism emerged from the Enlightenment',
  6:

In [16]:
# Build the text that gets embedded for a document:
# "<document title>: <first section title>: <first chunk of that section>".
keys = sorted(doc["sections"])
first_section = keys[0]
first_section_title = doc["sections"][first_section]["title"]
keys = sorted(doc["sections"][first_section]["chunks"])
first_content = doc["sections"][first_section]["chunks"][keys[0]]
first_chunk = ": ".join((doc["title"], first_section_title, first_content))
first_chunk

"Anarchism: Introduction: Introduction - '''Anarchism''' is a political philosophy and movement that is skeptical of all justifications for authority and seeks to abolish the institutions it claims maintain unnecessary coercion and hierarchy, typically including nation states, and capitalism"

In [12]:
import sqlite3

chunk_database_path = "wikipedia_chunks_256.db"
vector_database_path = "embedding_vectors_256_head_bge_base.db"

# Notebook-level buffers shared with _flush_pending(); all three are emptied
# after every flush attempt.
embeddings = {}        # id -> embedding bytes for the batch being written
last_record = 0        # id of the last document that was written successfully
batch_size = 50        # number of documents embedded and inserted per batch
chunk_ids = []         # document ids waiting to be embedded
chunks_to_embed = []   # head texts waiting to be embedded (parallel to chunk_ids)


def _flush_pending(conn):
    """Embed the buffered head texts, insert them in one transaction, empty the buffers.

    A failure (embedding or insert) is reported with the affected id range and
    the batch is dropped, so one bad batch cannot block every later one.
    """
    global last_record
    if not chunks_to_embed:
        return
    try:
        doc_embeddings = embed_chunks(chunks_to_embed)
        for pending_id, vector in zip(chunk_ids, doc_embeddings):
            embeddings[pending_id] = vector.tobytes()
        # `with conn` commits when the block succeeds and rolls back if it raises.
        with conn:
            conn.executemany(
                "INSERT INTO embeddings (id, vector) VALUES (?, ?)",
                list(embeddings.items()),
            )
        last_record = chunk_ids[-1]
    except Exception as e:
        print(f"An error occurred while saving docs {chunk_ids[0]}..{chunk_ids[-1]} : {e}")
    finally:
        embeddings.clear()
        chunk_ids.clear()
        chunks_to_embed.clear()


conn = sqlite3.connect(vector_database_path)
try:
    for idx, doc in tqdm(enumerate(fetch_documents(chunk_database_path)), total=6082528):
        # Skip everything up to and including the resume document.
        if doc["id"] <= last_inserted_doc_id:
            continue

        try:
            # Head text = "<title>: <first section title>: <first chunk>"; a section
            # without chunks contributes an empty chunk text.
            section_ids = sorted(doc["sections"])
            head_section = doc["sections"][section_ids[0]]
            chunk_keys = sorted(head_section["chunks"])
            first_content = head_section["chunks"][chunk_keys[0]] if chunk_keys else ""
            head_text = doc["title"] + ": " + head_section["title"] + ": " + first_content
        except Exception as e:
            # e.g. a document without sections, or a missing title; skip it.
            print(f"An error occurred while processing doc: {doc['title']} : {e}")
            continue

        # The embedding row is keyed by the document id.
        chunk_ids.append(doc["id"])
        chunks_to_embed.append(head_text)

        # Flush on buffer size (not on id distance) so every batch is batch_size long.
        if len(chunks_to_embed) >= batch_size:
            _flush_pending(conn)

    # Write the final partial batch; without this the last documents are never stored.
    _flush_pending(conn)
finally:
    conn.close()
# run 1 was at 246541 estimate 210h to finish
# max chunk id at this time was 2513706
# document id was 246541



  0%|          | 0/28176378 [00:00<?, ?it/s]

In [13]:
# Show which ids are still buffered in `embeddings`; empty once a batch has been written and cleared.
embeddings.keys()

dict_keys([])