In [1]:
import pypdf
import re
from typing import List
import os
from FlagEmbedding import BGEM3FlagModel
import os

os.environ['CURL_CA_BUNDLE'] = ''

In [2]:
reader = pypdf.PdfReader("ai_doc.pdf")

In [3]:
all_text = ""
for i in range (0, 8):
    text = reader.pages[i].extract_text()
    all_text += text

In [4]:
def clean_text(text: str) -> str:
    """Clean and normalize text content"""
    # Removing multiple lines
    text = re.sub(r"\n+", "\n", text)

    # Removing multiple spaces
    text = re.sub(r"\s+", " ", text)

    # Remove HTML comments 
    text = re.sub(r"<!--.?-->", "", text)
    text = re.sub(r"&", "u", text)

    return text

In [5]:
def split_into_chunks(text: str, max_chunk_size: int = 1024) -> List[str]:
    """Creates chunks with size of max_chunk_size parameter from the document."""
    chunks = []
    sentences = re.split(r'(?<=[.!?])\s+', text) # Splitting sentences from the document
    current_chunk = []
    current_size = 0

    for sentence in sentences:
        sentence_size = len(sentence)
        if current_size+sentence_size > max_chunk_size and current_chunk:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_size = 0
        
        current_chunk.append(sentence)
        current_size += sentence_size
    
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    
    return chunks

In [6]:
clean = clean_text(text=all_text)

In [7]:
chunks = split_into_chunks(text=clean, max_chunk_size=600)

In [8]:
for i, chunk in enumerate(chunks):
    filename = f"chunk_{i}.txt"
    filepath = f"/home/murad/Documents/self-study/contextual_embeddings/chunks/{filename}"

    try:
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(chunk)
    except Exception as e:
        print(f"Error saving chunk to {filepath}: {str(e)}.")

In [9]:
def load_documents(path = "/home/murad/Documents/self-study/contextual_embeddings/chunks"):
    """Loading all processed documents."""
    # Loading Python docs
    documents = []
    for file_path in os.listdir(path):
        with open(os.path.join(path, filepath), "r", encoding="utf-8") as f:
            content = f.read()
        documents.append(content)
    
    return documents

In [10]:
path = "/home/murad/Documents/self-study/contextual_embeddings/chunks"
docs = []
for file_path in os.listdir(path):
    with open(os.path.join(path, file_path), "r", encoding="utf-8") as f:
        content = f.read()
    docs.append(content)

In [11]:
# Then load your model
model = BGEM3FlagModel("BAAI/bge-m3", 
                      use_fp16=True)



Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

In [12]:
docs = [clean_text(doc) for doc in docs]

In [13]:
embeddings_1 = model.encode(docs, 
                            batch_size=12, 
                            max_length=8192, # If you don't need such a long length, you can set a smaller value to speed up the encoding process.
                            )['dense_vecs']

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [14]:
len(embeddings_1[0])

1024

In [15]:
import chromadb
import tempfile
import numpy as np

# Create a temporary directory with write permissions
temp_dir = tempfile.mkdtemp()
print(f"Using ChromaDB directory: {temp_dir}")

# Initialize ChromaDB client
client = chromadb.PersistentClient(path=temp_dir)

# Create a collection WITHOUT an embedding function
collection = client.create_collection(
    name="pdf_chunks",
    metadata={"description": "PDF chunks with BGE-M3 embeddings"}
)

# Prepare your data
document_ids = [f"chunk_{i}" for i in range(len(docs))]
metadatas = [{"source": f"chunk_{i}.txt"} for i in range(len(docs))]

# Convert numpy arrays to lists if needed
embeddings_list = []
for emb in embeddings_1:
    if isinstance(emb, np.ndarray):
        embeddings_list.append(emb.tolist())
    else:
        embeddings_list.append(emb)

# Add documents with pre-computed embeddings
collection.add(
    documents=docs,              # The document texts
    ids=document_ids,            # Unique IDs
    metadatas=metadatas,         # Metadata
    embeddings=embeddings_list   # Your pre-computed embeddings
)

print(f"Successfully stored {len(docs)} documents with embeddings in ChromaDB")
print(f"Collection name: pdf_chunks")
print(f"Database location: {temp_dir}")

Using ChromaDB directory: /tmp/tmpf9dwtvil
Successfully stored 37 documents with embeddings in ChromaDB
Collection name: pdf_chunks
Database location: /tmp/tmpf9dwtvil


In [19]:
query = ["Turinq testi nədir?"]
query_embedding = model.encode(sentences=query,
                               batch_size=12,
                               max_length=1024)["dense_vecs"]
query_embedding = query_embedding.tolist()

In [20]:
collection.query(query_embeddings=query_embedding,
                 n_results=10,
                 include=["documents", "distances", "metadatas"])

{'ids': [['chunk_16',
   'chunk_3',
   'chunk_22',
   'chunk_29',
   'chunk_25',
   'chunk_20',
   'chunk_7',
   'chunk_21',
   'chunk_23',
   'chunk_11']],
 'distances': [[0.6031309300372137,
   0.7156820159450029,
   0.9340197892554905,
   1.069768826620262,
   1.1289514818185007,
   1.1640132529745169,
   1.195128053477976,
   1.2231285490888089,
   1.2289387843506814,
   1.2372925567881798]],
 'metadatas': [[{'source': 'chunk_16.txt'},
   {'source': 'chunk_3.txt'},
   {'source': 'chunk_22.txt'},
   {'source': 'chunk_29.txt'},
   {'source': 'chunk_25.txt'},
   {'source': 'chunk_20.txt'},
   {'source': 'chunk_7.txt'},
   {'source': 'chunk_21.txt'},
   {'source': 'chunk_23.txt'},
   {'source': 'chunk_11.txt'}]],
 'embeddings': None,
 'documents': [['Turinq, bir kompüterin süni zəkaya sahib ola biləcəyini və hakimi çaşdıracaq qədər inandırıcı ola biləcəyini iddia edir. Hakim, insanla yoxsa kompüterlə danışdığını anlamayacaq. Bu testə Turinq testi deyilir.Turinq testinin məqsədi və əhəm