In [1]:
from preprocessor import Preprocessor
from FlagEmbedding import BGEM3FlagModel
from transformers import XLMRobertaModel, XLMRobertaTokenizer
from sentence_transformers import SentenceTransformer
import torch

In [2]:
# Create the single Preprocessor instance that the later cells reuse for reading,
# cleaning, chunking, saving and database creation.
# NOTE(review): "sample_doc.pdf" is a relative path — presumably resolved against the
# kernel's working directory; confirm in preprocessor.py.
preprocessor = Preprocessor(filepath="sample_doc.pdf")

In [3]:
# Read the raw document text through the preprocessor.
# Presumably this reads the file passed at construction time — verify in preprocessor.py.
text = preprocessor.read_text()

In [4]:
# Clean the raw text and keep the result under a new name so `text` stays available.
# The actual cleaning rules live in Preprocessor.clean_text and are not visible here.
clean_text = preprocessor.clean_text(text=text)

In [5]:
# Split the cleaned text into chunks; every embedding model below encodes this same
# `chunks` sequence. Chunk size/overlap are decided inside split_into_chunks.
chunks = preprocessor.split_into_chunks(text=clean_text)

In [6]:
# Persist the chunks to disk.
# NOTE(review): this is a machine-specific absolute path, so the cell will not run
# unchanged on another machine; consider making it configurable.
CHUNKS_DIR = "/home/murad/Documents/self-study/contextual_embeddings/chunks"
preprocessor.save_chunks(chunks=chunks, path=CHUNKS_DIR)

In [7]:
# Load the tokenizer and the encoder from the same checkpoint so that the token ids
# produced by one match the embedding table of the other.
XLM_CHECKPOINT = "xlm-roberta-base"
xlm_tokenizer = XLMRobertaTokenizer.from_pretrained(XLM_CHECKPOINT)
xlm_model = XLMRobertaModel.from_pretrained(XLM_CHECKPOINT)

In [8]:
# Load BGE-M3 through FlagEmbedding, requesting half-precision weights (use_fp16=True).
bge_model = BGEM3FlagModel(
    model_name_or_path="BAAI/bge-m3",
    use_fp16=True,
)

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

In [9]:
# Load LaBSE through sentence-transformers. The recorded output below shows the
# checkpoint files being downloaded on this run.
labse_model = SentenceTransformer("sentence-transformers/LaBSE")

config.json:   0%|          | 0.00/804 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/5.22M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.62M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

## XLM-RoBERTa

In [10]:
# Embed every chunk with the raw XLM-RoBERTa encoder, using the hidden state of the
# first token (<s>, XLM-R's CLS equivalent) as the chunk vector.
# Result: `xlm_embeddings`, a list with one tensor per chunk, in the order of `chunks`.
xlm_model.eval()  # inference mode: turns off dropout so repeated runs give the same vectors
xlm_embeddings = []

with torch.no_grad():  # no gradients are needed for pure inference
    for chunk in chunks:
        # truncation=True caps the input at the tokenizer's maximum model length.
        # Without it, a chunk longer than the encoder's position-embedding table
        # makes the forward pass fail instead of producing an embedding.
        inputs = xlm_tokenizer(
            chunk,
            return_tensors="pt",
            padding=True,
            truncation=True,
        )
        outputs = xlm_model(**inputs)
        # Index 0 along the sequence axis picks the first token of each sequence;
        # assumes each chunk is a single string, giving shape (1, hidden) — TODO confirm.
        chunk_embedding = outputs.last_hidden_state[:, 0, :]
        xlm_embeddings.append(chunk_embedding)

In [11]:
# Store the chunks together with their XLM-RoBERTa vectors in a dedicated collection.
xlm_collection = preprocessor.create_database(
    chunks=chunks,
    embeddings=xlm_embeddings,
    collection_name="xlm_collection",
)

Using ChromaDB directory: /home/murad/Documents/contextual_embeddings/databases/xlm_collection
Successfully stored 27 chunks with embeddings.
Database location: /home/murad/Documents/contextual_embeddings/databases/xlm_collection


## BGE-M3

In [12]:
# Encode all chunks with BGE-M3. The result is indexed by key below;
# only its "dense_vecs" entry is used as the embedding set.
bge_output = bge_model.encode(sentences=chunks, batch_size=12, max_length=8192)
bge_embeddings = bge_output["dense_vecs"]

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [13]:
# Store the chunks together with their BGE-M3 dense vectors in a dedicated collection.
bge_collection = preprocessor.create_database(
    chunks=chunks,
    embeddings=bge_embeddings,
    collection_name="bge_collection",
)

Using ChromaDB directory: /home/murad/Documents/contextual_embeddings/databases/bge_collection
Successfully stored 27 chunks with embeddings.
Database location: /home/murad/Documents/contextual_embeddings/databases/bge_collection


## LaBSE

In [14]:
# Encode all chunks with LaBSE in a single call, using the library's default
# batching and output format (no overrides are passed here).
labse_embeddings = labse_model.encode(sentences=chunks)

In [15]:
# Store the chunks together with their LaBSE vectors in a dedicated collection.
labse_collection = preprocessor.create_database(
    chunks=chunks,
    embeddings=labse_embeddings,
    collection_name="labse_collection",
)

Using ChromaDB directory: /home/murad/Documents/contextual_embeddings/databases/labse_collection
Successfully stored 27 chunks with embeddings.
Database location: /home/murad/Documents/contextual_embeddings/databases/labse_collection
