### NutriChat - Hybrid Search Evaluation

This notebook implements and evaluates hybrid search combining:
1. Text-based search (BM25)
2. Vector search
3. Combined scoring approach

#### Imports

In [1]:
import json
import pandas as pd
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch
from langchain.embeddings import SentenceTransformerEmbeddings
from typing import Dict
from langchain_elasticsearch import ElasticsearchRetriever

#### Load Data

In [2]:
# Load documents
docs_url = 'https://raw.githubusercontent.com/milanimcgraw/NutriChat/main/nutrition-docs-with-ids.json'
docs_response = requests.get(docs_url)
documents = docs_response.json()

#### Load Model

In [3]:
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)



## Indexing stage

In [4]:
for doc in tqdm(documents):
    food_text = doc['Food']
    measure_text = doc['Measure']
    grams_text = doc['Grams']
    calories_text = ['Calories'] 
    protein_text = ['Protein']
    fat_text = ['Fat']
    satfat_text = ['SatFat']
    fiber_text = ['Fiber']
    carbs_text = ['Carbs']
    category_text = ['Category']
    nutrition_text = f"  {doc['Food']} food, {doc['Measure']} measure, {doc['Grams']} grams, {doc['Calories']} calories, {doc['Protein']}g protein, {doc['Fat']}g fat, {doc['SatFat']}g satfat, {doc['Fiber']}g fiber, doc{['Carbs']}g carbs, {doc['Category']} category"
    full_text = f"{food_text} {measure_text} {grams_text} {calories_text} {protein_text} {fat_text} {satfat_text} {fiber_text} {category }{doc['Category']}"


# Generate vectors
for doc in tqdm(documents):
    doc['food_vector'] = model.encode(food_text)
    doc['measure_vector'] = model.encode(meeasure_text)
    doc['grams_vector'] = model.encode(grams_text)
    doc['calories_vector'] = model.encode(calories_text)
    doc['protein_vector'] = model.encode(protein_text)
    doc['fat_vector'] = model.encode(fat_text)
    doc['satfat_vector'] = model.encode(satfat_text)
    doc['fiber_vector'] = model.encode(fiber_text)
    doc['carbs_vector'] = model.encode(carbs_text) 
    doc['category_vector'] = model.encode(category_text)
    doc['full_vector'] = model.encode(full_text)

  0%|          | 0/948 [00:00<?, ?it/s]

#### ElasticSearch

In [5]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "Food": {"type": "text"},
            "Measure": {"type": "text"},
            "Grams": {"type": "float"},
            "Calories": {"type": "float"},
            "Protein": {"type": "float"},
            "Fat": {"type": "float"},
            "SatFat": {"type": "float"},
            "Fiber": {"type": "float"},
            "Carbs": {"type": "float"},
            "Category": {"type": "keyword"},
            "id": {"type": "keyword"},
            "food_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
            "measure_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
            "grams_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
             "calories_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            }
             "protein_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
             "fat_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            }, 
            "satfat_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            }, 
            "fiber_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
             "carbs_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            }, 
            "category_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            }, 
            "full_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
        }
    }
}

index_name = "nutrition-facts"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [6]:
#Index documents
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

#### Retrieval stage

In [8]:
es_url = 'http://localhost:9200'

In [9]:
# Test query
test_query = {
    'question': 'How many calories in an apple?',
    'category': 'Fruits A-F'
}

In [15]:
# Embeddings
embeddings = SentenceTransformerEmbeddings(model_name="sentence-transformers/multi-qa-MiniLM-L6-cos-v1")

In [21]:
def hybrid_query(search_query: str) -> Dict:
    vector = embeddings.embed_query(search_query)
    return {
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": search_query,
                        "fields": [
                            "Food^3",
                            "Measure",
                            "Grams",
                            "Calories",
                            "Protein",
                            "Fat",
                            "SatFat",
                            "Fiber",
                            "Carbs",
                            "Category"
                        ],
                        "type": "best_fields",
                        "boost": 0.5,
                    }
                },
                "filter": {
                    "term": {
                        "Category": category
                    }
                }
            }
        },
        "knn": {
            "field": "full_vector",
            "query_vector": vector,
            "k": 5,
            "num_candidates": 10000,
            "boost": 0.5,
            "filter": {
                "term": {
                    "Category": category
                }
            }
        },
        "size": 5
    }


hybrid_retriever = ElasticsearchRetriever.from_es_params(
    index_name=index_name,
    body_func=hybrid_query,
    content_field='text',
    url=es_url,
)

In [22]:
hybrid_results = hybrid_retriever.invoke(query)

In [23]:
for result in hybrid_results:
   print(result.metadata['_source'])

Course - Can I still join the course after the start date? data-engineering-zoomcamp 12.559245
Course - Can I follow the course after it finishes? data-engineering-zoomcamp 9.39959
Course - What can I do before the course starts? data-engineering-zoomcamp 7.306914
Course - Can I get support if I take the course in the self-paced mode? data-engineering-zoomcamp 7.1085525
Course - When will the course start? data-engineering-zoomcamp 6.7513986


### Hybrid Search

In [24]:
df_ground_truth = pd.read_csv('nutrichat-groundtruthdata.csv')

In [25]:
ground_truth = df_ground_truth.to_dict(orient='records')

In [34]:
def elastic_search_hybrid(field, query, category):
    def query_builder(search_query: str) -> Dict:
        vector = embeddings.embed_query(search_query)
        return {
            "query": {
                "bool": {
                    "must": {
                        "multi_match": {
                            "query": search_query,
                            "fields": [
                                "Food^3",
                                "Measure",
                                "Grams",
                                "Calories",
                                "Protein",
                                "Fat",
                                "SatFat",
                                "Fiber",
                                "Carbs",
                                "Category"
                            ],
                            "type": "best_fields",
                            "boost": 0.5,
                        }
                    },
                    "filter": {
                        "term": {
                            "Category": category
                        }
                    }
                }
            },
            "knn": {
                "field": field,
                "query_vector": vector,
                "k": 5,
                "num_candidates": 10000,
                "boost": 0.5,
                "filter": {
                    "term": {
                        "Category": category
                    }
                }
            },
            "size": 5,
            "_source": ["Food", "Measure", "Calories", "Protein", "Fat", "SatFat", "Fiber", "Carbs", "Category", "id"]
        }
    
    
    
    hybrid_retriever = ElasticsearchRetriever.from_es_params(
        index_name=index_name,
        body_func=hybrid_query,
        content_field='Food',
        url=es_url,
    )

    hybrid_results = hybrid_retriever.invoke(query)
    
    result_docs = []
    
    for hit in hybrid_results:
        result_docs.append(hit.metadata['_source'])

    return result_docs

### Evaluation

Hit Rate

In [None]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

MRR

In [None]:
def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

#### Load Ground Truth

In [None]:
gt_url = 'https://raw.githubusercontent.com/milanimcgraw/NutriChat/main/nutrichat-groundtruthdata.csv'
df_ground_truth = pd.read_csv(gt_url)
ground_truth = df_ground_truth.to_dict(orient='records')

In [35]:
ground_truth[0]

{'question': 'When does the course begin?',
 'course': 'data-engineering-zoomcamp',
 'document': 'c02e79ef'}

#### Run Hybrid Search

In [37]:
def hybrid_search(q):
    question = q['question']
    category = q['category']
    return elastic_search_hybrid('full_vector', question, category)

In [38]:
def evaluate(ground_truth, search_function):
    relevance_total = []
    for q in tqdm(ground_truth):
        doc_id = q['document_id']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)
    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

#### Results

In [None]:
# Evaluate
print("\nHybrid Search Results:")
hybrid_results = evaluate(ground_truth, hybrid_search)
print(hybrid_results)

Comparison

In [None]:
print("\nComparison with other approaches:")
print("Text Search:", "Previous text search results")  # Add text search results
print("Vector Search:", "Previous vector search results")  # Add vector search results
print("Hybrid Search:", hybrid_results)

Hybrid search with ES: `{'hit_rate': 0.9250054030689432, 'mrr': 0.8506231539514445}`