In [18]:
from haystack.document_stores.types import DuplicatePolicy
from haystack import Document
from haystack import Pipeline
from haystack.components.embedders import SentenceTransformersTextEmbedder, SentenceTransformersDocumentEmbedder

from haystack_integrations.components.retrievers.qdrant import QdrantEmbeddingRetriever
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore

document_store = QdrantDocumentStore(
    ":memory:",
    recreate_index=True,
    return_embedding=True,
    wait_result_from_api=True,
)

documents = [Document(content="There are over 7,000 languages spoken around the world today."),
						Document(content="Elephants have been observed to behave in a way that indicates a high level of self-awareness, such as recognizing themselves in mirrors."),
						Document(content="In certain parts of the world, like the Maldives, Puerto Rico, and San Diego, you can witness the phenomenon of bioluminescent waves.")]

# document_embedder = SentenceTransformersDocumentEmbedder(model="intfloat/e5-base")  
# document_embedder = SentenceTransformersDocumentEmbedder(model="sdadas/mmlw-retrieval-roberta-base")  
document_embedder = SentenceTransformersDocumentEmbedder(model="../lab5/best_model_f1_rand")  
document_embedder.warm_up()
documents_with_embeddings = document_embedder.run(documents)

document_store.write_documents(documents_with_embeddings.get("documents"), policy=DuplicatePolicy.OVERWRITE)

retriever1 = QdrantEmbeddingRetriever(document_store=document_store)

query_pipeline = Pipeline()
query_pipeline.add_component("text_embedder", SentenceTransformersTextEmbedder())
query_pipeline.add_component("retriever", retriever1)
query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")

query = "How many languages are there?"

result = query_pipeline.run({"text_embedder": {"text": query}})

print(result['retriever']['documents'][0])


Batches: 100%|██████████| 1/1 [00:00<00:00,  8.32it/s]
100it [00:00, 24212.34it/s]          
Batches: 100%|██████████| 1/1 [00:00<00:00, 20.03it/s]

Document(id=cfe93bc1c274908801e6670440bf2bbba54fad792770d57421f85ffa2a4fcc94, content: 'There are over 7,000 languages spoken around the world today.', score: -0.00494105619758533)





In [None]:
from haystack.nodes import EmbeddingRetriever
from haystack.document_stores import InMemoryDocumentStore

doc_store = InMemoryDocumentStore(
    similarity="cosine",  # the e5 models were trained with a cosine similarity function
    embedding_dim=768
)

e5 = EmbeddingRetriever(
    document_store=doc_store,
    embedding_model="intfloat/e5-base-v2",
    model_format="transformers",  # Make sure we specify the transformers model format
    pooling_strategy="reduce_mean",  # This is the pooling method used to train the e5 models
    top_k=20,
    max_seq_len=512,
)
doc_store.update_embeddings(e5)