In [1]:
%pip install -U sentence_transformers mixedbread-ai

/Users/juliuslipp/.zshenv:.:1: no such file or directory: /Users/juliuslipp/.cargo/env
Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting transformers<5.0.0,>=4.34.0 (from sentence_transformers)
  Using cached transformers-4.41.2-py3-none-any.whl.metadata (43 kB)
Collecting huggingface-hub>=0.15.1 (from sentence_transformers)
  Using cached huggingface_hub-0.23.4-py3-none-any.whl.metadata (12 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers<5.0.0,>=4.34.0->sentence_transformers)
  Using cached tokenizers-0.19.1-cp310-cp310-macosx_11_0_arm64.whl.metadata (6.7 kB)
Collecting safetensors>=0.4.1 (from transformers<5.0.0,>=4.34.0->sentence_transformers)
  Using cached safetensors-0.4.3-cp310-cp310-macosx_11_0_arm64.whl.metadata (3.8 kB)
Collecting fsspec (from torch>=1.11.0->sentence_transformers)
  Using cached fsspec-2024.6.0-py3-none-any.whl.metadata (11 kB)
Downloading sentence_transformers-3.0.1-py3-none-any.w

In [6]:
from haystack import Document
from datasets import load_dataset

# Product catalogue used by the rest of the notebook; each row of the "train"
# split is turned into one Haystack Document below.
ds = load_dataset("rajuptvs/ecommerce_products_clip")

# Document meta key -> dataset column the value is read from.
# Keeping this mapping in one place guarantees that the meta dict and the
# `meta_fields` list handed to the embedder/reranker can never drift apart.
META_COLUMNS = {
    "name": "Product_name",
    "price": "Price",
    "colors": "colors",
    "pattern": "Pattern",
    "extra": "Other Details",
}

documents = [
    Document(
        id=str(i),  # row position is used as the (unique) document id
        content=data["Description"],
        meta={key: data[column] for key, column in META_COLUMNS.items()},
    )
    for i, data in enumerate(ds["train"])
]

# A plain list of meta keys, in declaration order. The previous
# `documents[0].meta.keys()` was a live dict_keys view bound to the first
# document (it changed if that meta dict was mutated) and raised IndexError
# when the split was empty.
meta_fields = list(META_COLUMNS)

In [7]:
import os
from getpass import getpass

from haystack import Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.writers import DocumentWriter
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from mixedbread_ai_haystack import MixedbreadAIDocumentEmbedder, MixedbreadAITextEmbedder, MixedbreadAIReranker

# Ask for the key only when it is not already configured, so that a real key is
# never written into the notebook and an exported MXBAI_API_KEY is not clobbered.
if not os.environ.get("MXBAI_API_KEY"):
    os.environ["MXBAI_API_KEY"] = getpass("Enter your Mixedbread AI API key: ")

# -------------------------------------
# Instance Definition
# -------------------------------------
# One in-memory store is shared by the writer (indexing) and the retriever (querying).
document_store = InMemoryDocumentStore(embedding_similarity_function="cosine")
document_writer = DocumentWriter(document_store=document_store)
# The retriever returns 20 candidates; the reranker below keeps the best 5 of them.
embedding_retriever = InMemoryEmbeddingRetriever(document_store=document_store, top_k=20)

embed_model = "mixedbread-ai/mxbai-embed-large-v1"
reranking_model = "mixedbread-ai/mxbai-rerank-large-v1" # OR use 'default' for the latest model.

# Queries and documents are embedded with the same model.
text_embedder = MixedbreadAITextEmbedder(model=embed_model)
# `list(...)` gives each component its own concrete list of field names, even
# when `meta_fields` is a dict view or another iterable.
document_embedder = MixedbreadAIDocumentEmbedder(
    model=embed_model,
    max_concurrency=3,
    meta_fields_to_embed=list(meta_fields),
    show_progress_bar=True,
)
reranker = MixedbreadAIReranker(
    model=reranking_model,
    meta_fields_to_rank=list(meta_fields),
    top_k=5,
)



# -------------------------------------
# Indexing Pipeline
# -------------------------------------
# Two stages: embed the documents, then write them to the document store.
indexing_pipeline = Pipeline()
indexing_pipeline.add_component("document_embedder", document_embedder)
indexing_pipeline.add_component("document_writer", document_writer)
indexing_pipeline.connect(sender="document_embedder", receiver="document_writer")

# -------------------------------------
# Query Pipeline
# -------------------------------------
# Three stages: embed the query text, retrieve candidate documents by embedding
# similarity, then rerank those candidates.
query_pipeline = Pipeline()
query_pipeline.add_component(instance=text_embedder, name="text_embedder")
query_pipeline.add_component(instance=embedding_retriever, name="embedding_retriever")
query_pipeline.add_component(instance=reranker, name="reranker")
# Both connections name their sockets explicitly so the wiring is readable
# without relying on implicit type matching.
query_pipeline.connect("text_embedder.embedding", "embedding_retriever.query_embedding")
query_pipeline.connect("embedding_retriever.documents", "reranker.documents")

<haystack.core.pipeline.pipeline.Pipeline object at 0x28b753160>
🚅 Components
  - text_embedder: MixedbreadAITextEmbedder
  - embedding_retriever: InMemoryEmbeddingRetriever
  - reranker: MixedbreadAIReranker
🛤️ Connections
  - text_embedder.embedding -> embedding_retriever.query_embedding (List[float])
  - embedding_retriever.documents -> reranker.documents (List[Document])

In [3]:
# Embed every product document and write it to the document store.
# The returned summary is left as the cell's last expression so it is displayed.
indexing_pipeline.run(
    data={"document_embedder": {"documents": documents}},
)

MixedbreadAIDocumentEmbedder - Calculating embedding batches: 100%|██████████| 15/15 [00:06<00:00,  2.24it/s]


{'document_embedder': {'meta': {'model': 'mixedbread-ai/mxbai-embed-large-v1',
   'object': <ObjectType.LIST: 'list'>,
   'normalized': True,
   'encoding_format': <EncodingFormat.FLOAT: 'float'>,
   'dimensions': 1024,
   'usage': Usage(prompt_tokens=119007, total_tokens=119007, completion_tokens=None)}},
 'document_writer': {'documents_written': 1913}}

In [5]:
# Natural-language product question. The same string is given to the text
# embedder (as `text`) and to the reranker (as `query`).
query = "I am looking for a regular fit t-shirt in blue color. Ideally without any prints. What are my options?"

results = query_pipeline.run(
    data=dict(
        text_embedder=dict(text=query),
        reranker=dict(query=query),
    )
)

# Show the reranker's final documents.
print(results["reranker"]["documents"])

[Document(id=701, content: 'unknown', meta: {'name': 'Men Regular Fit Solid Casual Shirt', 'price': '₹1,434', 'colors': 'Blue', 'pattern': 'Solid', 'extra': 'unknown'}, score: 0.21203613), Document(id=629, content: 'unknown', meta: {'name': 'Men Regular Fit Solid Casual Shirt', 'price': '₹1,212', 'colors': 'Blue', 'pattern': 'Solid', 'extra': 'unknown'}, score: 0.20983887), Document(id=102, content: 'High quality premium Full sleeves Plain  Shirt direct from the manufacturers. Gives you perfect fit,...', meta: {'name': 'Men Regular Fit Printed Spread Collar Casual Shirt', 'price': '₹349', 'colors': 'Dark Blue', 'pattern': 'Printed', 'extra': 'unknown'}, score: 0.16711426), Document(id=575, content: 'Shirt Style: Casual shirt with slim collar, roll up sleeves, rounded hem and button closure. Fabric:...', meta: {'name': 'Men Regular Fit Printed Slim Collar Casual Shirt', 'price': '₹499', 'colors': 'Blue', 'pattern': 'Printed', 'extra': '100% cotton'}, score: 0.13293457), Document(id=461,