# Elasticsearch Hybrid Search Experiment

Testing hybrid search (BM25 + vector) on the Wikipedia dataset.

In [None]:
import os
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv

# Pull settings from a local .env file (if present) into the environment.
load_dotenv()

# Connection settings. os.getenv returns None for a variable that is unset;
# these two are passed straight to the Elasticsearch client constructor.
ELASTICSEARCH_ENDPOINT = os.getenv("ELASTICSEARCH_ENDPOINT")
ELASTICSEARCH_API_KEY = os.getenv("ELASTICSEARCH_API_KEY")

# Fail with an actionable message instead of the AttributeError that calling
# .lower() on a missing (None) value would otherwise raise.
_raw_index_name = os.getenv("INDEX_NAME")
if not _raw_index_name:
    raise RuntimeError(
        "Environment variable INDEX_NAME is not set; "
        "define it in the environment or in your .env file."
    )

# Index names are normalised to lower case before use.
INDEX_NAME = _raw_index_name.lower()

In [None]:
# Initialize clients
# Elasticsearch client built from the endpoint and API key read from the
# environment; it is shared by every search helper in this notebook.
client = Elasticsearch(
    ELASTICSEARCH_ENDPOINT,
    api_key=ELASTICSEARCH_API_KEY
)

# Sentence-embedding model used to encode query text into vectors.
# NOTE(review): presumably the same model produced the vectors stored in the
# index's 'embedding' field -- confirm against the indexing code.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Sanity check: these calls hit the cluster, so a bad endpoint or credential
# surfaces here rather than inside the first search.
print(f"Connected to Elasticsearch: {client.info()['version']['number']}")
print(f"Index: {INDEX_NAME}")
print(f"Document count: {client.count(index=INDEX_NAME)['count']:,}")

In [None]:
def hybrid_search(query: str, k: int = 5, vector_weight: float = 0.5):
    """Perform hybrid search combining BM25 and vector similarity.

    The two signals are fused as a weighted sum of scores inside a single
    ``bool``/``should`` query. This is a linear score combination, not
    Reciprocal Rank Fusion. Because the vector clause wraps ``match_all``,
    every document in the index is scored by the similarity script, and the
    BM25 clause adds to the score of documents that also match the text.

    Args:
        query: Free-text query; used both for the text match and, after
            encoding with the module-level ``model``, for the vector clause.
        k: Maximum number of hits to return.
        vector_weight: Boost applied to the vector-similarity clause. The
            BM25 clause receives ``1 - vector_weight``. Must lie in [0, 1].

    Returns:
        The list of hit dictionaries from ``response["hits"]["hits"]``.

    Raises:
        ValueError: If ``vector_weight`` is outside [0, 1] (including NaN),
            which would otherwise yield a negative boost on one clause.
    """
    # Validate before doing any work: a weight outside [0, 1] makes one of
    # the two boosts negative, and the request would be rejected only after
    # the embedding had already been computed.
    if not 0.0 <= vector_weight <= 1.0:
        raise ValueError(
            f"vector_weight must be between 0 and 1, got {vector_weight!r}"
        )

    # Generate query embedding
    query_embedding = model.encode(query).tolist()

    # Lexical clause: BM25 over title (boosted x2) and text.
    text_clause = {
        "multi_match": {
            "query": query,
            "fields": ["title^2", "text"],
            "boost": 1 - vector_weight,
        }
    }

    # Semantic clause: cosine similarity against the stored 'embedding'
    # vector. Adding 1.0 shifts the similarity from [-1, 1] to [0, 2] so the
    # script score is never negative.
    vector_clause = {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
                "params": {"query_vector": query_embedding},
            },
            "boost": vector_weight,
        }
    }

    # Weighted-sum fusion: the scores of matching 'should' clauses are added.
    response = client.search(
        index=INDEX_NAME,
        body={
            "size": k,
            "query": {"bool": {"should": [text_clause, vector_clause]}},
            "_source": ["id", "title", "text", "chunk_index"],
        },
    )

    return response["hits"]["hits"]

In [None]:
def display_results(results):
    """Print every hit as a numbered block with score, title, chunk and text.

    Each block shows the 1-based rank, the score to four decimals, the
    title, the chunk index and the text. Text longer than 500 characters is
    cut to its first 500 characters followed by "...".
    """
    heavy_rule = "=" * 80
    light_rule = "-" * 40

    for rank, hit in enumerate(results, start=1):
        score = hit["_score"]
        doc = hit["_source"]

        print(f"\n{heavy_rule}")
        print(f"Result {rank} | Score: {score:.4f}")
        print(f"Title: {doc['title']}")
        print(f"Chunk: {doc['chunk_index']}")
        print(light_rule)

        # Only over-long text is shortened; short text is printed untouched.
        if len(doc['text']) > 500:
            print(doc['text'][:500] + "...")
        else:
            print(doc['text'])

In [None]:
# Test query
# Smoke test: run one hybrid search and print its top five hits.
query = "Who was Alexander Obolensky?"

print(f"Query: {query}")
# vector_weight is left at its default, so both signals are weighted 0.5.
results = hybrid_search(query, k=5)
display_results(results)

## Compare Search Methods

In [None]:
def bm25_search(query: str, k: int = 5):
    """Run a text-only (BM25) search and return the raw hits.

    Matches ``query`` against ``title`` (boosted x2) and ``text`` in
    ``INDEX_NAME`` and returns at most ``k`` hit dictionaries.
    """
    text_query = {
        "multi_match": {
            "query": query,
            "fields": ["title^2", "text"],
        }
    }
    request_body = {
        "size": k,
        "query": text_query,
        # Only the fields the display helper needs are fetched.
        "_source": ["id", "title", "text", "chunk_index"],
    }

    hits = client.search(index=INDEX_NAME, body=request_body)["hits"]["hits"]
    return hits


def vector_search(query: str, k: int = 5, num_candidates: int = 100):
    """Vector-only search using kNN.

    Encodes ``query`` with the module-level ``model`` and runs a kNN search
    against the ``embedding`` field of ``INDEX_NAME``.

    Args:
        query: Free-text query to embed.
        k: Number of nearest neighbours to return; must be at least 1.
        num_candidates: Size of the candidate pool for the kNN search.
            It is raised to ``k`` when smaller, because the candidate pool
            may not be smaller than the number of neighbours requested.

    Returns:
        The list of hit dictionaries from ``response["hits"]["hits"]``.

    Raises:
        ValueError: If ``k`` is less than 1.
    """
    # Reject an unusable k before paying for the embedding computation.
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k!r}")

    query_embedding = model.encode(query).tolist()

    response = client.search(
        index=INDEX_NAME,
        body={
            "size": k,
            "knn": {
                "field": "embedding",
                "query_vector": query_embedding,
                "k": k,
                # Previously fixed at 100, which broke any request with
                # k > 100; the pool now always covers k.
                "num_candidates": max(num_candidates, k),
            },
            "_source": ["id", "title", "text", "chunk_index"],
        },
    )
    return response["hits"]["hits"]

In [None]:
# Run the same query through each search method and print the results
# under a labelled banner so the rankings can be compared.
query = "Who was Alexander Obolensky?"

banner = "=" * 80
methods = (
    ("BM25 SEARCH", bm25_search),
    ("VECTOR SEARCH", vector_search),
    ("HYBRID SEARCH", hybrid_search),
)

for position, (label, search_fn) in enumerate(methods):
    # Blank lines separate sections; the first section has none before it.
    if position > 0:
        print("\n\n")
    print(banner)
    print(label)
    print(banner)
    display_results(search_fn(query))