In [17]:
import minsearch
import pandas as pd
from tqdm.auto import tqdm

## Load datasets

In [18]:
plants_data = pd.read_csv("../data/plants_data.csv")
documents = plants_data.to_dict(orient='records')

df_question = pd.read_csv("../data/ground-truth-retrieval-5q.csv")
ground_truth = df_question.to_dict(orient='records')

In [19]:
documents[:3]

[{'id': 0,
  'name': 'Adelonema wallisii',
  'summary': 'Adelonema wallisii (synonym Homalomena wallisii) is a species of aroid plant (family Araceae) native to Venezuela, Colombia, and Panama.\n\n',
  'cultivation': 'No data available',
  'toxicity': 'No data available'},
 {'id': 1,
  'name': 'Adenium obesum',
  'summary': 'Adenium obesum, more commonly known as a desert rose, is a poisonous species of flowering plant belonging to the tribe Nerieae of the subfamily Apocynoideae of the dogbane family, Apocynaceae. It is native to the Sahel regions south of the Sahara (from Mauritania and Senegal to Sudan), tropical and subtropical eastern and southern Africa, as well as the Arabian Peninsula. Other names for the flower include Sabi star, kudu, mock azalea, and impala lily. Adenium obesum is a popular houseplant and bonsai in temperate regions.\n\n',
  'cultivation': "Adenium obesum is a popular houseplant and bonsai in temperate regions. It requires a sunny location and a minimum indoo

In [20]:
ground_truth[:3]

[{'id': 0, 'question': 'Where is Adelonema wallisii originally found?'},
 {'id': 0, 'question': 'What family does the Adelonema wallisii belong to?'},
 {'id': 0,
  'question': 'Can you tell me about the common names of Adelonema wallisii?'}]

## Evaluation metrics

In [21]:
def hit_rate(relevance_total):
    """
    Calculate the Hit Rate for a set of ranked results.

    The Hit Rate measures the proportion of queries for which at least one 
    relevant item is present in the returned results, regardless of its rank.

    Args:
        relevance_total (list of list of bool): 
            A list where each sublist corresponds to a single query's ranked 
            results. Each boolean in a sublist indicates whether the result at 
            that rank is relevant (True) or not (False).

    Returns:
        float: Hit Rate value between 0 and 1.
    """
    cnt = 0
    for line in relevance_total:
        if True in line:
            cnt += 1
    return cnt / len(relevance_total)


def recall_at_1(relevance_total):
    """
    Calculate Recall@1 (Recall at rank 1) for a set of ranked results.

    Recall@1 measures the proportion of queries for which the first returned 
    result is relevant.

    Args:
        relevance_total (list of list of bool): 
            A list where each sublist corresponds to a single query's ranked 
            results. Each boolean in a sublist indicates whether the result at 
            that rank is relevant (True) or not (False).

    Returns:
        float: Recall@1 value between 0 and 1.
    """
    cnt = 0
    for line in relevance_total:
        if line[0]:
            cnt += 1
    return cnt / len(relevance_total)

def mrr(relevance_total):
    """
    Calculate the Mean Reciprocal Rank (MRR) for a set of ranked results.

    MRR measures the average reciprocal rank of the first relevant result 
    across queries. If a query has multiple relevant results, only the first 
    relevant result contributes to the score.

    Args:
        relevance_total (list of list of bool): 
            A list where each sublist corresponds to a single query's ranked 
            results. Each boolean in a sublist indicates whether the result at 
            that rank is relevant (True) or not (False).

    Returns:
        float: MRR value between 0 and 1.
    """
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)
                break
    return total_score / len(relevance_total)


# DataFrame with metrics
df_metrics = pd.DataFrame({"method" : [], "hit_rate": [], "recall_at_first_pos": [], "mrr": []})

In [22]:
def evaluate(ground_truth, search_function):
    """
    Run queries through a search function and record relevance results.

    For each query in `ground_truth`, this calls `search_function` and
    produces a list of booleans indicating whether each retrieved document's
    ID matches the query's correct ID. Supports both Qdrant results and plain
    list/dict results.

    Args:
        ground_truth (list of dict): Queries with a correct 'id'.
        search_function (callable): Function returning ranked search results.

    Returns:
        list[list[bool]]: Per-query relevance lists for metric calculations.
    """
    
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['id']
        results = search_function(q)
        # Normalize results to a list of IDs
        if hasattr(results, "points"):  # Qdrant search result
            relevance = [d.payload["id"] == doc_id for d in results.points]
        else:  # List/dict-based search result
            relevance = [d['id'] == doc_id for d in results]

        relevance_total.append(relevance)


    return {
        'hit_rate': hit_rate(relevance_total),
        'recall_at_first_pos': recall_at_1(relevance_total),
        'mrr': mrr(relevance_total),
    }


## MinSearch

In [23]:
index = minsearch.Index(
    text_fields=["name", "summary", "cultivation", "tixicity"],
    keyword_fields=['id']
)

In [24]:
index.fit(documents)

<minsearch.minsearch.Index at 0x799aaa4a6120>

In [25]:
def minsearch_search(query):
    """
    Search the index using MinSearch.

    Executes a search on the global `index` object with the given query,
    no filters, an empty boost dictionary, and returns the top 5 results.

    Args:
        query (str): Search query string.

    Returns:
        list[dict]: Ranked search results from the index.
    """
    boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=5
    )

    return results

In [40]:
minsearch_metrics = evaluate(ground_truth, lambda q: minsearch_search(q['question']))
minsearch_metrics_to_add = {'method': 'minsearch', **minsearch_metrics}
minsearch_metrics_to_add

  0%|          | 0/985 [00:00<?, ?it/s]

{'method': 'minsearch',
 'hit_rate': 0.8954314720812183,
 'recall_at_first_pos': 0.8223350253807107,
 'mrr': 0.8475465313028768}

## Vector Search: TfidfVectorizer + SVD

In [27]:
from minsearch import VectorSearch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

In [28]:
texts = []

for doc in documents:
    t = doc['name'] + ' ' + doc['summary'] + ' ' + doc['cultivation']+ ' ' + doc['toxicity']
    texts.append(t)

pipeline = make_pipeline(
    TfidfVectorizer(min_df=1),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)

In [29]:
vindex = VectorSearch(keyword_fields={'id'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x799aaa070a10>

In [30]:
def vector_search(query):

    query = pipeline.transform([query])
    results = vindex.search(
        query_vector=query,
        num_results=5
    )

    return results

In [31]:
vector_search_metrics = evaluate(ground_truth, lambda q: vector_search(q['question']))
vector_search_metrics_to_add = {'method': 'vector_search_tfidf_svd', **vector_search_metrics}
vector_search_metrics_to_add

  0%|          | 0/985 [00:00<?, ?it/s]

{'method': 'vector_search_tfidf_svd',
 'hit_rate': 0.9218274111675127,
 'recall_at_first_pos': 0.8010152284263959,
 'mrr': 0.8525042301184438}

## Vector Search: jina embeddings

#### Setup for Qdrant

First, install the Qdrant client with FastEmbed support:

```bash
pip install -q "qdrant-client[fastembed]>=1.14.2"
```
Then, run Qdrant in Docker:

```bash
docker pull qdrant/qdrant

docker run -p 6333:6333 -p 6334:6334 \
   -v "$(pwd)/qdrant_storage:/qdrant/storage:z" \
   qdrant/qdrant

In [32]:
from qdrant_client import QdrantClient, models
client = QdrantClient("http://localhost:6333")

In [33]:
EMBEDDING_DIMENSIONALITY = 512
model_handle = "jinaai/jina-embeddings-v2-small-en"

In [34]:
# Define the collection name
collection_name = "rag-project"
if client.collection_exists(collection_name):
    client.delete_collection(collection_name)                               
# Create the collection with specified vector parameters
client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=EMBEDDING_DIMENSIONALITY,  # Dimensionality of the vectors
        distance=models.Distance.COSINE  # Distance metric for similarity search
    )
)

True

In [35]:
points = []

for i, doc in enumerate(documents):
    text = doc['name'] + ' ' + doc['summary'] + ' ' + doc['cultivation']+ ' ' + doc['toxicity']
    vector = models.Document(text=text, model=model_handle)
    point = models.PointStruct(
        id=i,
        vector=vector,
        payload=doc
    )
    points.append(point)

In [36]:
client.upsert(
    collection_name=collection_name,
    points=points
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [37]:
def jina_emb_search(query, limit=5):
    """
        Searches for similar documents using Jina embeddings.

        Args:
            query: The search query string.
            limit: The maximum number of results to return. Defaults to 5.

        Returns:
            A list of search results.
    """
    results = client.query_points(
        collection_name=collection_name,
        query=models.Document( 
            text=query,
            model=model_handle
        ),
        limit=limit, # top closest matches
        with_payload=True, #to get metadata in the results,

    )
    
    return results

In [38]:
vector_search_jina_emb_metrics = evaluate(ground_truth, lambda q: jina_emb_search(q['question']))
vector_search_jina_emb_metrics_to_add = {'method': 'vector_search_jina_emb_metrics', **vector_search_jina_emb_metrics}
vector_search_jina_emb_metrics_to_add

  0%|          | 0/985 [00:00<?, ?it/s]

{'method': 'vector_search_jina_emb_metrics',
 'hit_rate': 0.9279187817258884,
 'recall_at_first_pos': 0.8609137055837564,
 'mrr': 0.8867174280879867}

## Hybrid search

In [10]:
collection_name = "rag-project-sparse-and-dense"
# Create the collection with both vector types
if client.collection_exists(collection_name):
    client.delete_collection(collection_name)    
client.create_collection(
    collection_name=collection_name,
    vectors_config={
        # Named dense vector for jinaai/jina-embeddings-v2-small-en
        "jina-small": models.VectorParams(
            size=512,
            distance=models.Distance.COSINE,
        ),
    },
    sparse_vectors_config={
        "bm25": models.SparseVectorParams(
            modifier=models.Modifier.IDF,
        )
    }
)

True

In [11]:
points = []

for i, doc in enumerate(documents):
    text = doc['name'] + ' ' + doc['summary'] + ' ' + doc['cultivation']+ ' ' + doc['toxicity']
    vector = {
                "jina-small": models.Document(
                    text=text,
                    model="jinaai/jina-embeddings-v2-small-en",
                ),
                "bm25": models.Document(
                    text=text, 
                    model="Qdrant/bm25",
                ),
    }
    point = models.PointStruct(
        id=i,
        vector=vector,
        payload=doc
    )
    points.append(point)

In [12]:
client.upsert(
    collection_name=collection_name,
    points=points
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [13]:
def multi_stage_search(query, limit = 5):
    """
    Perform a hybrid multi-stage search combining BM25 keyword search 
    with semantic prefetch using Jina embeddings. 
    Prefetch retrieves 10× the requested results for improved reranking, 
    then returns the top matches with payloads.

    Args:
        query (str): Search query text.
        limit (int, optional): Number of final results to return. Defaults to 5.

    Returns:
        list[models.ScoredPoint]: Ranked search results with payload data.
    """
    results = client.query_points(
        collection_name=collection_name,
        prefetch=[
            models.Prefetch(
                query=models.Document(
                    text=query,
                    model="jinaai/jina-embeddings-v2-small-en",
                ),
                using="jina-small",
                # Prefetch ten times more results, then
                # expected to return, so we can really rerank
                limit=(10 * limit),
            ),
        ],
        query=models.Document(
            text=query,
            model="Qdrant/bm25", 
        ),
        using="bm25",
        limit=limit,
        with_payload=True,
    )

    return results

In [14]:
hybrid_search_metrics = evaluate(ground_truth, lambda q: multi_stage_search(q['question']))
hybrid_search_metrics_to_add = {'method': 'hybrid_search_metrics', **hybrid_search_metrics}
hybrid_search_metrics_to_add

  0%|          | 0/985 [00:00<?, ?it/s]

{'method': 'hybrid_search_metrics',
 'hit_rate': 0.9411167512690355,
 'recall_at_first_pos': 0.8710659898477158,
 'mrr': 0.9003214890016923}

In [41]:
rows = [minsearch_metrics_to_add, vector_search_metrics_to_add, vector_search_jina_emb_metrics_to_add, hybrid_search_metrics_to_add]
# Convert to DataFrame and append all at once
df_metrics = pd.concat([df_metrics, pd.DataFrame(rows)], ignore_index=True)
df_metrics

Unnamed: 0,method,hit_rate,recall_at_first_pos,mrr
0,minsearch,0.895431,0.822335,0.847547
1,vector_search_tfidf_svd,0.921827,0.801015,0.852504
2,vector_search_jina_emb_metrics,0.927919,0.860914,0.886717
3,hybrid_search_metrics,0.941117,0.871066,0.900321
