<a href="https://colab.research.google.com/github/nickprock/appunti_data_science/blob/master/semantic-search/DBFR_haystack_qdrant.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# !pip install haystack-ai
!pip install git+https://github.com/deepset-ai/haystack.git@main
!pip install qdrant-haystack
!pip install fastembed-haystack

Collecting git+https://github.com/deepset-ai/haystack.git@main
  Cloning https://github.com/deepset-ai/haystack.git (to revision main) to /tmp/pip-req-build-6ns4m2oe
  Running command git clone --filter=blob:none --quiet https://github.com/deepset-ai/haystack.git /tmp/pip-req-build-6ns4m2oe
  Resolved https://github.com/deepset-ai/haystack.git to commit 7178aa02532ebc55384b05ee3fd821e73c43133a
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [None]:
from haystack import Document, Pipeline
from haystack.components.writers import DocumentWriter
from haystack_integrations.components.retrievers.qdrant import QdrantHybridRetriever, QdrantSparseEmbeddingRetriever, QdrantEmbeddingRetriever
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
from haystack.document_stores.types import DuplicatePolicy
from haystack_integrations.components.embedders.fastembed import (
	FastembedTextEmbedder,
	FastembedDocumentEmbedder,
	FastembedSparseTextEmbedder,
	FastembedSparseDocumentEmbedder
)
from haystack.components.joiners import DocumentJoiner

# --- Configuration -----------------------------------------------------------
# Models used to embed the documents at indexing time.
SPARSE_MODEL = "Qdrant/bm42-all-minilm-l6-v2-attentions"
DENSE_MODEL = "BAAI/bge-small-en-v1.5"
# Dense vector size configured on the store.
# NOTE(review): presumably the output dimension of DENSE_MODEL — confirm before
# swapping the dense model.
EMBEDDING_DIM = 384

# --- Document store ----------------------------------------------------------
# In-memory Qdrant store, recreated on every run, with sparse embeddings enabled
# so that both dense and sparse vectors can be stored.
document_store = QdrantDocumentStore(
    ":memory:",
    recreate_index=True,
    use_sparse_embeddings=True,
    embedding_dim=EMBEDDING_DIM,
)

# --- Toy corpus --------------------------------------------------------------
documents = [
    Document(content="My name is Wolfgang and I live in Berlin"),
    Document(content="I saw a black horse running"),
    Document(content="Germany has many big cities"),
    Document(content="fastembed is supported by and maintained by Qdrant."),
]

# --- Indexing pipeline -------------------------------------------------------
# sparse embedder -> dense embedder -> writer: the documents flow through both
# embedders before they are written to the store.
indexing_pipeline = Pipeline()
indexing_pipeline.add_component(
    "sparse_doc_embedder",
    FastembedSparseDocumentEmbedder(model=SPARSE_MODEL),
)
indexing_pipeline.add_component(
    "dense_doc_embedder",
    FastembedDocumentEmbedder(model=DENSE_MODEL),
)
indexing_pipeline.add_component(
    "writer",
    DocumentWriter(document_store=document_store, policy=DuplicatePolicy.OVERWRITE),
)
indexing_pipeline.connect("sparse_doc_embedder", "dense_doc_embedder")
indexing_pipeline.connect("dense_doc_embedder", "writer")

indexing_result = indexing_pipeline.run({"sparse_doc_embedder": {"documents": documents}})

# Fail loudly if the store did not receive the whole corpus; the query cells
# below would otherwise silently search an incomplete index.
assert indexing_result["writer"]["documents_written"] == len(documents), (
    "not every document was written to the document store"
)

# Last expression of the cell: display the writer summary.
indexing_result



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

stopwords.txt:   0%|          | 0.00/936 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

model.onnx:   0%|          | 0.00/91.0M [00:00<?, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/706 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

model_optimized.onnx:   0%|          | 0.00/66.5M [00:00<?, ?B/s]

Calculating sparse embeddings: 100%|██████████| 4/4 [00:00<00:00, 52.05it/s]
Calculating embeddings: 100%|██████████| 4/4 [00:00<00:00,  6.53it/s]
100it [00:00, 3642.82it/s]           


{'writer': {'documents_written': 4}}

In [None]:
# Hybrid query pipeline: the question is embedded twice (sparse and dense), each
# embedding drives its own Qdrant retriever, and the two result lists are merged
# by a DocumentJoiner using distribution-based rank fusion.
querying = Pipeline()
querying.add_component(
    "sparse_text_embedder",
    FastembedSparseTextEmbedder(model="Qdrant/bm42-all-minilm-l6-v2-attentions"),
)
querying.add_component(
    "dense_text_embedder",
    FastembedTextEmbedder(
        model="BAAI/bge-small-en-v1.5",
        # Query-side instruction prefix prepended to the question text.
        prefix="Represent this sentence for searching relevant passages: ",
    ),
)
querying.add_component("sparse_retriever", QdrantSparseEmbeddingRetriever(document_store=document_store))
querying.add_component("dense_retriever", QdrantEmbeddingRetriever(document_store=document_store))
querying.add_component("joiner", DocumentJoiner(join_mode="distribution_based_rank_fusion"))

# Wire each embedder to its retriever, then both retrievers into the joiner.
querying.connect("sparse_text_embedder.sparse_embedding", "sparse_retriever.query_sparse_embedding")
querying.connect("dense_text_embedder.embedding", "dense_retriever.query_embedding")
querying.connect("sparse_retriever.documents", "joiner")
querying.connect("dense_retriever.documents", "joiner")

question = "Who supports fastembed?"

# The same question is fed to both embedders.
# `debug=True` is intentionally not passed: it had no visible effect on the
# returned dict, and (NOTE(review): confirm) recent Haystack versions appear to
# no longer accept a `debug` argument in `Pipeline.run`.
results = querying.run(
    {
        "dense_text_embedder": {"text": question},
        "sparse_text_embedder": {"text": question},
    }
)

# Top-ranked document after fusion.
print(results["joiner"]["documents"][0])

# Output recorded for this cell (note that the fused score is not 1.0):
# Document(id=...,
#  content: 'fastembed is supported by and maintained by Qdrant.',
#  score: 0.7844045030951453)

Calculating sparse embeddings: 100%|██████████| 1/1 [00:00<00:00, 26.58it/s]
Calculating embeddings: 100%|██████████| 1/1 [00:00<00:00,  5.54it/s]


Document(id=09074d8d1e7a956dcfa8a0cc75cabad565448c6701e92dc41314eae07af56265, content: 'fastembed is supported by and maintained by Qdrant.', score: 0.7844045030951453)


In [None]:
# Display the full pipeline output: every document returned by the joiner,
# together with its fused score.
results

{'joiner': {'documents': [Document(id=09074d8d1e7a956dcfa8a0cc75cabad565448c6701e92dc41314eae07af56265, content: 'fastembed is supported by and maintained by Qdrant.', score: 0.7844045030951453),
   Document(id=3cc62890af7a5b7efb4a29d0a1eb394e9ddd142c8c3263d08b1e17599cd4b071, content: 'Germany has many big cities', score: 0.44007128490376024),
   Document(id=62fad790ad2af927af9432c87330ed2ea5e31332cdec8e9d6235a5105ab0aaf5, content: 'My name is Wolfgang and I live in Berlin', score: 0.41458606113736063),
   Document(id=f52df56839c8082fb97edec2fd218e16bf431edae30adaf9bc13cdccb46b2883, content: 'I saw a black horse running', score: 0.360938150863734)]}}