In [6]:
import asyncio
import nest_asyncio
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the API credentials the agent run
# needs; nothing in this notebook reads them explicitly, so confirm.
load_dotenv()

# Patch asyncio so that asyncio.run() can be called later in the notebook.
# NOTE(review): presumably needed because the notebook kernel already has an
# event loop running; confirm.
nest_asyncio.apply()

In [37]:
from pydantic import BaseModel
from typing import List, Optional

# Define a simple output model for search results
class MilvusSearchResult(BaseModel):
    # A single search hit, used as the element type of MilvusSearchResults.results.
    id: int  # id of the matched entity (the sample run shows Milvus' auto-generated INT64 ids here)
    text: str  # text of the matched document
    score: float  # relevance score; in the sample run this carries the hit's `distance` value

class MilvusSearchResults(BaseModel):
    # Structured final answer of the agent; passed as `output_type` when the Agent is built.
    results: List[MilvusSearchResult]  # the individual hits
    query: str  # presumably the text that was searched for; filled in by the agent, confirm
    collection: str  # presumably the name of the collection that was searched; confirm

In [27]:
import json
from typing import List, Dict, Any, Optional
from pymilvus import MilvusClient
from agents import function_tool, RunContextWrapper

@function_tool
async def search_milvus_text(
    ctx: RunContextWrapper[Any],
    collection_name: str,
    query_text: str,
    limit: int
) -> str:
    """Search for text documents in a Milvus collection using full text search.

    Args:
        collection_name: Name of the Milvus collection to search.
        query_text: The text query to search for.
        limit: Maximum number of results to return.

    Returns:
        The raw Milvus search results rendered as a string, or a message
        starting with "Error searching Milvus:" if the search could not be run.
    """
    # A non-positive limit can never produce hits; report it using the same
    # error-string contract as any other failure instead of asking the server.
    if limit < 1:
        return f"Error searching Milvus: limit must be a positive integer, got {limit}"

    # Tracked outside the try block so the finally clause can tell whether a
    # connection was actually opened.
    client = None
    try:
        # Initialize Milvus client against the same endpoint the setup cell uses
        client = MilvusClient(uri="http://localhost:19530")

        # Prepare search parameters for BM25
        search_params = {
            "metric_type": "BM25",
            "params": {"drop_ratio_search": 0.2}
        }

        # Execute search with text query: the raw text is passed as the query
        # data and matched against the "sparse" field.
        # NOTE(review): this is a blocking call inside a coroutine; it holds
        # the event loop for the duration of the request.
        results = client.search(
            collection_name=collection_name,
            data=[query_text],
            anns_field="sparse",
            limit=limit,
            search_params=search_params,
            output_fields=["text"]
        )
        print(f'Results are: {results}')
        # Just return the raw results as a string
        return str(results)

    except Exception as e:
        # The tool never raises: the failure is handed back to the agent as text.
        print(f'Exception is: {e}')
        return f"Error searching Milvus: {str(e)}"

    finally:
        # A new client is created on every tool call, so release its
        # connection here; otherwise each call leaks one.
        if client is not None:
            try:
                client.close()
            except Exception as close_error:
                # A failed close must not replace the result being returned.
                print(f'Exception while closing Milvus client: {close_error}')

In [30]:
from pymilvus import DataType, Function, FunctionType, MilvusClient

# Connect to the Milvus server on localhost; this module-level client is
# reused by the schema, index, collection and insert cells below.
client = MilvusClient(uri="http://localhost:19530")

# Start a new schema object; fields and the BM25 function are attached to it
# in the following cells.
schema = client.create_schema()

In [31]:
## 1. Basic Setup
# Simple schema that handles both text and vectors: an id, the raw text,
# and a sparse vector field.

# Primary key; auto_id=True, so the insert cell supplies no ids.
schema.add_field(
    field_name="id", datatype=DataType.INT64, is_primary=True, auto_id=True
)
# Raw document text (max_length=1000) with the analyzer enabled.
# NOTE(review): enable_analyzer is presumably what lets Milvus tokenize this
# field for BM25; confirm against the Milvus docs.
schema.add_field(
    field_name="text", datatype=DataType.VARCHAR, max_length=1000, enable_analyzer=True
)
# Sparse vector field; it becomes the output field of the BM25 function
# registered in the next cell.
schema.add_field(field_name="sparse", datatype=DataType.SPARSE_FLOAT_VECTOR)

{'auto_id': False, 'description': '', 'fields': [{'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'text', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 1000, 'enable_analyzer': True}}, {'name': 'sparse', 'description': '', 'type': <DataType.SPARSE_FLOAT_VECTOR: 104>}], 'enable_dynamic_field': False}

In [32]:
from pymilvus import Function

# Milvus handles tokenization and BM25 conversion: this function declares
# that the "sparse" field is generated from the "text" field.
# NOTE: FunctionType is not imported in this cell; it comes from the import in
# the earlier client-setup cell.
bm25_function = Function(
    name="text_bm25_emb",  # Function name
    input_field_names=["text"],  # Name of the VARCHAR field containing raw text data
    output_field_names=[
        "sparse"
    ],  # Name of the SPARSE_FLOAT_VECTOR field reserved to store generated embeddings
    function_type=FunctionType.BM25,
)

# Attach the function to the schema; the schema repr printed below then shows
# "sparse" flagged as a function output.
schema.add_function(bm25_function)

{'auto_id': False, 'description': '', 'fields': [{'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'text', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 1000, 'enable_analyzer': True}}, {'name': 'sparse', 'description': '', 'type': <DataType.SPARSE_FLOAT_VECTOR: 104>, 'is_function_output': True}], 'enable_dynamic_field': False, 'functions': [{'name': 'text_bm25_emb', 'description': '', 'type': <FunctionType.BM25: 1>, 'input_field_names': ['text'], 'output_field_names': ['sparse'], 'params': {}}]}

In [33]:
## 2. Index and Collection
# Build the index definition that is passed to create_collection below.
index_params = client.prepare_index_params()

# Index the sparse field using the BM25 metric, matching the metric_type the
# search tool sends in its search parameters.
index_params.add_index(field_name="sparse", index_type="AUTOINDEX", metric_type="BM25")

# Drop any existing 'demo' collection so that re-running this cell starts
# from an empty collection instead of inserting the test data again.
if client.has_collection('demo'):
    client.drop_collection('demo')

# Create the collection with the schema and index definition prepared above.
client.create_collection(
    collection_name="demo", schema=schema, index_params=index_params,
)

## 3. Loading Test Data
# Mix of technical terms and regular text: 100 short sentences, the first 50
# about information retrieval (IR) in general and the last 50 about Milvus.
# Only "text" is supplied per row; "id" is declared auto_id in the schema and
# "sparse" is the output field of the BM25 function.
# The insert result is the cell's last expression, so the insert count and the
# generated ids are displayed as the cell output.
client.insert(
    "demo",
    [
        {"text": "Information retrieval helps users find relevant documents in large datasets."},
        {"text": "Search engines use information retrieval techniques to index and rank web pages."},
        {"text": "The core of IR is matching user queries with the most relevant content."},
        {"text": "Vector search is revolutionising modern information retrieval systems."},
        {"text": "Machine learning improves ranking algorithms in information retrieval."},
        {"text": "IR techniques include keyword-based search, semantic search, and vector search."},
        {"text": "Boolean retrieval is one of the earliest information retrieval methods."},
        {"text": "TF-IDF is a classic method used to score document relevance in IR."},
        {"text": "Modern IR systems integrate deep learning for better contextual understanding."},
        {"text": "Knowledge graphs enhance information retrieval by structuring relationships."},
        {"text": "Query expansion improves retrieval by adding synonyms and related terms."},
        {"text": "Information retrieval systems must balance precision and recall."},
        {"text": "Clustering algorithms help organise search results in IR."},
        {"text": "Personalisation in IR adapts results based on user behaviour."},
        {"text": "Search engines rely on indexing structures like inverted indexes."},
        {"text": "Latent Semantic Indexing (LSI) improves retrieval by understanding word relationships."},
        {"text": "Vector databases enhance IR by enabling similarity search."},
        {"text": "IR systems process structured, semi-structured, and unstructured data."},
        {"text": "Hybrid search combines traditional IR with vector search for better results."},
        {"text": "Query embedding techniques improve semantic search in IR."},
        {"text": "Multi-modal retrieval integrates text, image, and audio searches."},
        {"text": "Knowledge distillation helps optimise large IR models for efficiency."},
        {"text": "Efficient IR requires scalable indexing and fast query execution."},
        {"text": "Retrieval-augmented generation (RAG) improves LLMs with better factual grounding."},
        {"text": "Reinforcement learning is being explored to improve search ranking in IR."},
        {"text": "Clustering helps group similar documents for efficient retrieval."},
        {"text": "Metadata plays a crucial role in filtering results in IR systems."},
        {"text": "IR models benefit from transformer architectures like BERT and T5."},
        {"text": "Passage retrieval is crucial for question-answering systems."},
        {"text": "Domain-specific IR requires fine-tuned models for better accuracy."},
        {"text": "Personalised search enhances user experience in IR systems."},
        {"text": "Graph-based retrieval improves the ranking of interconnected documents."},
        {"text": "Zero-shot retrieval is gaining traction in open-domain information retrieval."},
        {"text": "Query rewriting techniques improve recall in IR systems."},
        {"text": "Approximate nearest neighbour (ANN) search speeds up large-scale IR tasks."},
        {"text": "Efficient indexing strategies reduce latency in search queries."},
        {"text": "Long-form document retrieval requires special handling in IR models."},
        {"text": "Multi-step retrieval-refinement loops enhance search quality."},
        {"text": "Vector quantisation reduces memory footprint in large-scale IR."},
        {"text": "AI-powered chatbots leverage IR to retrieve relevant responses."},
        {"text": "Multilingual IR requires cross-lingual embeddings for better search results."},
        {"text": "Graph neural networks (GNNs) enhance entity-based information retrieval."},
        {"text": "Search efficiency is crucial for real-time IR applications."},
        {"text": "Knowledge-grounded IR improves AI-generated responses."},
        {"text": "Adversarial training helps IR models handle query variations."},
        {"text": "Combining dense and sparse retrieval improves hybrid search."},
        {"text": "Query intent classification enhances IR performance."},
        {"text": "Self-supervised learning is advancing IR techniques."},
        {"text": "Real-time document updates improve search freshness in IR."},
        {"text": "AI-powered IR is reshaping e-commerce product recommendations."},
        {"text": "Milvus is an open-source vector database designed for AI-powered search."},
        {"text": "Milvus enables fast and scalable similarity search on high-dimensional data."},
        {"text": "With Milvus, developers can build applications that support image, text, and video retrieval."},
        {"text": "Milvus integrates well with deep learning frameworks like PyTorch and TensorFlow."},
        {"text": "The core of Milvus is optimised for approximate nearest neighbour (ANN) search."},
        {"text": "Milvus supports hybrid search combining structured and unstructured data."},
        {"text": "Large-scale AI applications rely on Milvus for efficient vector retrieval."},
        {"text": "Milvus makes it easy to perform high-speed similarity searches."},
        {"text": "Cloud-native by design, Milvus scales effortlessly with demand."},
        {"text": "Milvus powers applications in recommendation systems, fraud detection, and genomics."},
        {"text": "The latest version of Milvus introduces faster indexing and lower latency."},
        {"text": "Milvus supports HNSW, IVF_FLAT, and other popular ANN algorithms."},
        {"text": "Vector embeddings from models like OpenAI’s CLIP can be indexed in Milvus."},
        {"text": "Milvus has built-in support for multi-tenancy in enterprise use cases."},
        {"text": "The Milvus community actively contributes to improving its performance."},
        {"text": "Milvus integrates with data pipelines like Apache Kafka for real-time updates."},
        {"text": "Using Milvus, companies can enhance search experiences with vector search."},
        {"text": "Milvus plays a crucial role in powering AI search in medical research."},
        {"text": "Image search applications leverage Milvus for accurate results."},
        {"text": "Companies use Milvus to optimise personalised recommendations."},
        {"text": "Text embedding models work seamlessly with Milvus for NLP applications."},
        {"text": "Milvus simplifies deploying scalable AI-driven search engines."},
        {"text": "The Milvus Python SDK makes it easy to interact with the database."},
        {"text": "High-dimensional data retrieval is much faster with Milvus than traditional databases."},
        {"text": "E-commerce platforms integrate Milvus for personalised shopping experiences."},
        {"text": "Milvus is widely used in cybersecurity for anomaly detection."},
        {"text": "Social media platforms can enhance content discovery with Milvus."},
        {"text": "Image and video fingerprinting applications rely on Milvus."},
        {"text": "Geospatial search benefits from Milvus’ vector search capabilities."},
        {"text": "Milvus supports both cloud and on-premises deployments."},
        {"text": "Developers can quickly deploy Milvus using Docker and Kubernetes."},
        {"text": "The scalability of Milvus makes it suitable for big data applications."},
        {"text": "Milvus optimises indexing strategies for faster retrieval."},
        {"text": "Knowledge management systems benefit from Milvus-powered search."},
        {"text": "The indexing mechanisms in Milvus make querying billions of vectors efficient."},
        {"text": "With Milvus, AI agents can retrieve information contextually."},
        {"text": "Milvus integrates with LangChain for advanced RAG pipelines."},
        {"text": "Open-source contributors continue to enhance Milvus’ search performance."},
        {"text": "Multi-modal search in Milvus enables applications beyond text and images."},
        {"text": "Milvus has an intuitive REST API for easy integration."},
        {"text": "Milvus’ FAISS and HNSW backends provide flexibility in indexing."},
        {"text": "The architecture of Milvus ensures fault tolerance and high availability."},
        {"text": "Video recommendation engines benefit from Milvus-powered search."},
        {"text": "Data scientists use Milvus to accelerate experimentation with embeddings."},
        {"text": "Milvus integrates seamlessly with LLM-based applications."},
        {"text": "Startups leverage Milvus to build next-gen AI-powered products."},
        {"text": "The Milvus Roadmap shows continuous improvements in speed and features."},
        {"text": "Milvus Cloud offers a managed solution for vector search at scale."},
        {"text": "Real-time AI search applications thrive with Milvus’ efficiency."},
        {"text": "The future of AI search is being shaped by Milvus and similar vector databases."}
    ],
)

{'insert_count': 100, 'ids': [456486814660619039, 456486814660619040, 456486814660619041, 456486814660619042, 456486814660619043, 456486814660619044, 456486814660619045, 456486814660619046, 456486814660619047, 456486814660619048, 456486814660619049, 456486814660619050, 456486814660619051, 456486814660619052, 456486814660619053, 456486814660619054, 456486814660619055, 456486814660619056, 456486814660619057, 456486814660619058, 456486814660619059, 456486814660619060, 456486814660619061, 456486814660619062, 456486814660619063, 456486814660619064, 456486814660619065, 456486814660619066, 456486814660619067, 456486814660619068, 456486814660619069, 456486814660619070, 456486814660619071, 456486814660619072, 456486814660619073, 456486814660619074, 456486814660619075, 456486814660619076, 456486814660619077, 456486814660619078, 456486814660619079, 456486814660619080, 456486814660619081, 456486814660619082, 456486814660619083, 456486814660619084, 456486814660619085, 456486814660619086, 4564868146

In [38]:

from agents import Agent, Runner, WebSearchTool, trace

async def main():
    """Run the Milvus search agent on a sample query and print its final output.

    The agent is given a web search tool and the Milvus full text search tool,
    and is asked to answer in the MilvusSearchResults shape. The run and the
    print both happen inside a trace named "Milvus search example".
    """
    # Tools offered to the agent, in the order they are registered.
    available_tools = [
        WebSearchTool(user_location={"type": "approximate", "city": "New York"}),
        search_milvus_text,  # Use our text search tool
    ]

    searcher = Agent(
        name="Milvus Searcher",
        instructions="You are a helpful agent that can search through Milvus vector database using full text search.",
        tools=available_tools,
        output_type=MilvusSearchResults,
    )

    # Sample request against the 'demo' collection.
    prompt = "Find documents in the 'demo' collection that are similar to this concept: 'information retrieval'"

    with trace("Milvus search example"):
        run_result = await Runner.run(searcher, prompt)
        print(run_result.final_output)

In [39]:
# Run the agent demo to completion. The output below shows the tool's
# "Results are: ..." print followed by the agent's structured final output.
# NOTE(review): presumably this only works in a notebook because
# nest_asyncio.apply() was called in the first cell; confirm.
asyncio.run(main())

Results are: data: ["[{'id': 456486814660619045, 'distance': 4.198044776916504, 'entity': {'text': 'Boolean retrieval is one of the earliest information retrieval methods.'}}, {'id': 456486814660619071, 'distance': 4.052548885345459, 'entity': {'text': 'Zero-shot retrieval is gaining traction in open-domain information retrieval.'}}, {'id': 456486814660619043, 'distance': 3.98502779006958, 'entity': {'text': 'Machine learning improves ranking algorithms in information retrieval.'}}, {'id': 456486814660619042, 'distance': 3.98502779006958, 'entity': {'text': 'Vector search is revolutionising modern information retrieval systems.'}}, {'id': 456486814660619048, 'distance': 3.98502779006958, 'entity': {'text': 'Knowledge graphs enhance information retrieval by structuring relationships.'}}]"]
results=[MilvusSearchResult(id=456486814660619045, text='Boolean retrieval is one of the earliest information retrieval methods.', score=4.198044776916504), MilvusSearchResult(id=456486814660619071, t