In [16]:
import os

import torch
import chromadb

from openai import OpenAI

from FlagEmbedding import BGEM3FlagModel
from transformers import XLMRobertaModel, XLMRobertaTokenizer
from sentence_transformers import SentenceTransformer

## Loading models and vector databases

In [17]:
# Both the encoder and its tokenizer are loaded from the same checkpoint name.
xlm_checkpoint = "xlm-roberta-base"
xlm_model = XLMRobertaModel.from_pretrained(xlm_checkpoint)
xlm_tokenizer = XLMRobertaTokenizer.from_pretrained(xlm_checkpoint)

In [18]:
# Root directory holding the persisted Chroma databases. Set the
# CHROMA_DB_ROOT environment variable to run the notebook on another machine;
# when it is unset or empty, the original location is used.
CHROMA_DB_ROOT = os.environ.get("CHROMA_DB_ROOT") or (
    "/home/murad/Documents/self-study/contextual_embeddings/databases"
)

# Open the on-disk database that stores the XLM-RoBERTa embeddings and fetch
# its existing "pdf_chunks" collection.
xlm_client = chromadb.PersistentClient(
    path=os.path.join(CHROMA_DB_ROOT, "xlm_collection")
)

xlm_collection = xlm_client.get_collection("pdf_chunks")

In [19]:
# BGE-M3 embedding model, constructed with the fp16 option switched on.
bge_model = BGEM3FlagModel(model_name_or_path="BAAI/bge-m3", use_fp16=True)

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

In [20]:
# Root directory holding the persisted Chroma databases. Set the
# CHROMA_DB_ROOT environment variable to run the notebook on another machine;
# when it is unset or empty, the original location is used.
CHROMA_DB_ROOT = os.environ.get("CHROMA_DB_ROOT") or (
    "/home/murad/Documents/self-study/contextual_embeddings/databases"
)

# Open the on-disk database that stores the BGE-M3 embeddings and fetch its
# existing "pdf_chunks" collection.
bge_client = chromadb.PersistentClient(
    path=os.path.join(CHROMA_DB_ROOT, "bge_collection")
)
bge_collection = bge_client.get_collection("pdf_chunks")

In [21]:
# LaBSE embedding model, loaded through the SentenceTransformer wrapper.
labse_model = SentenceTransformer(model_name_or_path="sentence-transformers/LaBSE")

In [22]:
# Root directory holding the persisted Chroma databases. Set the
# CHROMA_DB_ROOT environment variable to run the notebook on another machine;
# when it is unset or empty, the original location is used.
CHROMA_DB_ROOT = os.environ.get("CHROMA_DB_ROOT") or (
    "/home/murad/Documents/self-study/contextual_embeddings/databases"
)

# Open the on-disk database that stores the LaBSE embeddings and fetch its
# existing "pdf_chunks" collection.
labse_client = chromadb.PersistentClient(
    path=os.path.join(CHROMA_DB_ROOT, "labse_collection")
)
labse_collection = labse_client.get_collection("pdf_chunks")

In [23]:
# Read the OpenAI API key from the environment instead of embedding it in the
# notebook. SECURITY: a literal key was previously stored in this cell; treat
# that key as compromised and revoke/rotate it.
openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before running this notebook."
    )

# NOTE: despite its name, `openai_model` holds the OpenAI API client object.
openai_model = OpenAI(api_key=openai_api_key)

In [24]:
# Root directory holding the persisted Chroma databases. Set the
# CHROMA_DB_ROOT environment variable to run the notebook on another machine;
# when it is unset or empty, the original location is used.
CHROMA_DB_ROOT = os.environ.get("CHROMA_DB_ROOT") or (
    "/home/murad/Documents/self-study/contextual_embeddings/databases"
)

# Open the on-disk database that stores the OpenAI embeddings and fetch its
# existing "pdf_chunks" collection. Note that `openai_client` is a Chroma
# client, not an OpenAI API client.
openai_client = chromadb.PersistentClient(
    path=os.path.join(CHROMA_DB_ROOT, "openai_collection")
)
openai_collection = openai_client.get_collection("pdf_chunks")

## Query

In [25]:
# Test query shared by every model below. It is written in Azerbaijani;
# English gloss: "Which artificial intelligence strategy exists in Azerbaijan?"
query = "Azərbaycanda hansı süni intellekt strategiyası mövcuddur?"

### XLM RoBERTa model result

In [26]:
# Put the encoder in evaluation mode before embedding the query.
xlm_model.eval()

# Gradients are not needed for a forward pass used only to embed the query.
with torch.no_grad():
    inputs = xlm_tokenizer(query, padding=True, return_tensors="pt")
    outputs = xlm_model(**inputs)

# Use the hidden state at token position 0 as the query vector, drop the
# leading batch dimension, and convert it to a plain Python list.
first_token_state = outputs.last_hidden_state[:, 0, :]
query_embedding_xlm = first_token_state.squeeze(0).numpy().tolist()

In [27]:
# Query the XLM-RoBERTa collection with the query vector, asking for 5 results
# together with their distances, metadata and document text.
xlm_result = xlm_collection.query(
    query_embeddings=query_embedding_xlm,
    include=["distances", "metadatas", "documents"],
    n_results=5,
)

In [28]:
# Save the string form of the retrieved documents for later inspection.
xlm_documents_text = str(xlm_result["documents"])
with open("xlm_result.txt", mode="w", encoding="utf-8") as f:
    f.write(xlm_documents_text)

### BGE-M3 model result

In [29]:
# Encode the query with BGE-M3, keep only the "dense_vecs" entry of the
# returned mapping, and convert it to a plain Python list in one step.
query_embedding_bge = bge_model.encode(
    sentences=query,
    max_length=1024,
    batch_size=12,
)["dense_vecs"].tolist()

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [30]:
# Query the BGE-M3 collection with the query vector, asking for 5 results
# together with their distances, metadata and document text.
bge_result = bge_collection.query(
    query_embeddings=query_embedding_bge,
    include=["distances", "metadatas", "documents"],
    n_results=5,
)

In [31]:
# Save the string form of the retrieved documents for later inspection.
bge_documents_text = str(bge_result["documents"])
with open("bge_result.txt", mode="w", encoding="utf-8") as f:
    f.write(bge_documents_text)

### LaBSE model result

In [32]:
# Encode the query with LaBSE and convert the result to a plain Python list.
query_embedding_labse = labse_model.encode(sentences=query).tolist()

In [33]:
# Query the LaBSE collection with the query vector, asking for 5 results
# together with their distances, metadata and document text.
labse_result = labse_collection.query(
    query_embeddings=query_embedding_labse,
    include=["distances", "metadatas", "documents"],
    n_results=5,
)

In [34]:
# Save the string form of the retrieved documents for later inspection.
labse_documents_text = str(labse_result["documents"])
with open("labse_result.txt", mode="w", encoding="utf-8") as f:
    f.write(labse_documents_text)

### OpenAI model result

In [35]:
def get_openai_embeddings(texts, model="text-embedding-3-small", client=None):
    """Generate embeddings for one or more texts using the OpenAI API.

    Args:
        texts: A single string or a list of strings to embed. A bare string
            is wrapped in a one-element list before the request is made.
        model: Name of the OpenAI embedding model to request.
        client: Object exposing ``embeddings.create(input=..., model=...)``.
            Defaults to the notebook-level ``openai_model`` client, so
            existing calls keep working without passing it.

    Returns:
        A list of embeddings, one per item in ``response.data``, in the order
        the response lists them.
    """
    # Fall back to the notebook's shared client only when none is supplied;
    # passing one explicitly avoids relying on hidden global state.
    if client is None:
        client = openai_model

    # Normalise the single-string case so the request is always a list.
    if isinstance(texts, str):
        texts = [texts]

    response = client.embeddings.create(
        input=texts,
        model=model
    )

    # Extract the embedding vector from each returned item.
    return [item.embedding for item in response.data]

# Embed the query with the default OpenAI embedding model. The helper returns
# a list of embedding vectors, so this name holds a list rather than a single
# vector.
query_embeddings_openai = get_openai_embeddings(query)

In [36]:
# Query the OpenAI collection with the query embeddings, asking for 5 results
# together with their distances, metadata and document text.
# NOTE(review): the recorded output of this notebook shows every entry of
# 'documents' as None for this collection -- it looks like the collection was
# populated without document text; confirm against the ingestion code.
openai_result = openai_collection.query(
    query_embeddings=query_embeddings_openai,
    include=["distances", "metadatas", "documents"],
    n_results=5,
)

In [37]:
# Show the raw query result for the OpenAI collection as the cell output.
openai_result

{'ids': [['chunk_23', 'chunk_2', 'chunk_0', 'chunk_26', 'chunk_21']],
 'distances': [[0.6106777614404577,
   0.7667791586044204,
   0.7702448629085455,
   0.8159988161732269,
   0.8270810206209869]],
 'metadatas': [[{'source': 'chunk_23.txt'},
   {'source': 'chunk_2.txt'},
   {'source': 'chunk_0.txt'},
   {'source': 'chunk_26.txt'},
   {'source': 'chunk_21.txt'}]],
 'embeddings': None,
 'documents': [[None, None, None, None, None]],
 'uris': None,
 'data': None}