### NutriChat - Vector-Based Retrieval Evaluation

This notebook evaluates vector-based retrieval approaches:
1. Basic hit rate and MRR metrics
2. Question/Food vector search
3. Text/Measure vector search 
4. Combined vector approaches
5. Analysis of retrieval performance across different food categories for our nutrition facts dataset.

In [None]:
#Import libraries
import json
import requests
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch
from tqdm.auto import tqdm
import pandas as pd 

#### Data

Documents with IDs

In [None]:
# Load the nutrition documents (the "with-ids" export) from the project repository.
docs_url = 'https://raw.githubusercontent.com/milanimcgraw/NutriChat/main/nutrition-docs-with-ids.json'
docs_response = requests.get(docs_url)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
docs_response.raise_for_status()
documents = docs_response.json()

In [None]:
# Peek at the first five documents to sanity-check the download.
documents[:5]

Ground Truth Data

In [None]:
# Ground-truth questions are read straight from the CSV in the project repository.
gt_url = 'https://raw.githubusercontent.com/milanimcgraw/NutriChat/main/nutrichat-groundtruthdata.csv'
df_ground_truth = pd.read_csv(gt_url)
# One dict per CSV row, keyed by column name.
ground_truth = df_ground_truth.to_dict('records')

In [None]:
# Peek at the first five ground-truth records.
ground_truth[:5]

#### Initialize Sentence Transformer

In [3]:
# Sentence-transformers checkpoint used for every embedding in this notebook.
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)

In [6]:
# Smoke-test the encoder on a sample question; `v` is inspected in the next cell.
v = model.encode('How many calories are in an apple?')

In [None]:
# Embedding dimensionality; the dense_vector mappings below hard-code dims=384 to match it.
len(v)

384

#### ElasticSearch

In [11]:
# Client for the Elasticsearch node at http://localhost:9200; shared by all indexing and search cells.
es_client = Elasticsearch('http://localhost:9200')

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [None]:
# Names of the per-field embedding vectors stored alongside each document.
# All of them share the same dense_vector mapping, so the mapping is generated
# from this list instead of being copy-pasted (the hand-written version was
# missing a comma between two entries, which made the cell a SyntaxError).
vector_field_names = [
    "food_vector",
    "measure_vector",
    "grams_vector",
    "calories_vector",
    "protein_vector",
    "fat_vector",
    "satfat_vector",
    "fiber_vector",
    "carbs_vector",
    "category_vector",
]

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            # Raw document fields.
            "Food": {"type": "text"},
            "Measure": {"type": "text"},
            "Grams": {"type": "float"},
            "Calories": {"type": "float"},
            "Protein": {"type": "float"},
            "Fat": {"type": "float"},
            "SatFat": {"type": "float"},
            "Fiber": {"type": "float"},
            "Carbs": {"type": "float"},
            "Category": {"type": "keyword"},
            "id": {"type": "keyword"},
            # One indexed 384-dimensional cosine vector per embedded field
            # (a fresh dict per field, so no mapping object is shared).
            **{
                name: {
                    "type": "dense_vector",
                    "dims": 384,
                    "index": True,
                    "similarity": "cosine"
                }
                for name in vector_field_names
            }
        }
    }
}

# Target index for all documents and their per-field vectors.
index_name = "nutrition-facts"

# Drop any existing index first so re-running this cell starts from an empty index
# (ignore_unavailable keeps the delete from failing when the index does not exist yet).
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

In [None]:
### Create and index document vectors
# Vector field -> source field. Every source value is stringified before it is
# encoded, so numeric columns are embedded as their text representation.
vector_sources = {
    'food_vector': 'Food',
    'measure_vector': 'Measure',
    'grams_vector': 'Grams',
    'calories_vector': 'Calories',
    'protein_vector': 'Protein',
    'fat_vector': 'Fat',
    'satfat_vector': 'SatFat',
    'fiber_vector': 'Fiber',
    'carbs_vector': 'Carbs',
    'category_vector': 'Category',
}

for doc in tqdm(documents):
    # Attach all ten embeddings to the document in place, then index it.
    for vector_field, source_field in vector_sources.items():
        doc[vector_field] = model.encode(str(doc[source_field]))

    es_client.index(index=index_name, document=doc)

In [15]:
# Sample free-text question used to try out the encoder.
query = 'What are the lowest calorie options in dairy products?'

In [16]:
# Embed the sample question. The search helpers below compute their own local `v_q`.
v_q = model.encode(query)

#### Vector Search Functions

In [29]:
def elastic_search_knn(field, vector):
    """Run a kNN search against a single dense-vector field.

    Args:
        field: Name of the dense_vector field to search (e.g. 'food_vector').
        vector: Query embedding compared against `field`.

    Returns:
        A list with the `_source` dict of each hit, in the order Elasticsearch
        returned them. The request asks for k=5 neighbours out of 10000
        candidates, and only the raw document fields (no vectors) are returned.
    """
    returned_fields = ["Food", "Measure", "Grams", "Calories", "Protein", "Fat", "SatFat", "Fiber", "Carbs", "Category", "id"]

    request_body = {
        "knn": {
            "field": field,
            "query_vector": vector,
            "k": 5,
            "num_candidates": 10000,
        },
        "_source": returned_fields,
    }

    response = es_client.search(index=index_name, body=request_body)

    return [hit['_source'] for hit in response['hits']['hits']]

In [None]:
def question_text(q):
    """Return the question string carried by `q`.

    The search helpers are called in two ways in this notebook: with a plain
    string (single-query cells) and with a dict (the test-search cell builds
    `{'question': ...}`, and `evaluate` passes each ground-truth record to the
    search function unchanged). The encoder is only ever given text elsewhere
    in the notebook, so dicts are unwrapped here.

    NOTE(review): assumes ground-truth records keep the question under the
    'question' key — confirm against the CSV header.
    """
    if isinstance(q, dict):
        return q['question']
    return q


def _encode_and_search(field, q):
    """Embed the question in `q` and run a kNN search on `field`."""
    v_q = model.encode(question_text(q))
    return elastic_search_knn(field, v_q)


def food_vector_knn(q):
    """kNN search of the question against the 'food_vector' field."""
    return _encode_and_search('food_vector', q)

def measure_vector_knn(q):
    """kNN search of the question against the 'measure_vector' field."""
    return _encode_and_search('measure_vector', q)

def grams_vector_knn(q):
    """kNN search of the question against the 'grams_vector' field."""
    return _encode_and_search('grams_vector', q)

def calories_vector_knn(q):
    """kNN search of the question against the 'calories_vector' field."""
    return _encode_and_search('calories_vector', q)

def protein_vector_knn(q):
    """kNN search of the question against the 'protein_vector' field."""
    return _encode_and_search('protein_vector', q)

def fat_vector_knn(q):
    """kNN search of the question against the 'fat_vector' field."""
    return _encode_and_search('fat_vector', q)

def satfat_vector_knn(q):
    """kNN search of the question against the 'satfat_vector' field."""
    return _encode_and_search('satfat_vector', q)

def fiber_vector_knn(q):
    """kNN search of the question against the 'fiber_vector' field."""
    return _encode_and_search('fiber_vector', q)

def carbs_vector_knn(q):
    """kNN search of the question against the 'carbs_vector' field."""
    return _encode_and_search('carbs_vector', q)

def category_vector_knn(q):
    """kNN search of the question against the 'category_vector' field."""
    return _encode_and_search('category_vector', q)

In [None]:
def elastic_search_knn_combined(vector):
    """Rank documents by the summed cosine similarity over all vector fields.

    Args:
        vector: Query embedding, compared against each of the ten per-field
            vectors of every document.

    Returns:
        A list with the `_source` dict of the 5 best-scoring documents, in the
        order Elasticsearch returned them.
    """
    vector_fields = [
        'food_vector',
        'measure_vector',
        'grams_vector',
        'calories_vector',
        'protein_vector',
        'fat_vector',
        'satfat_vector',
        'fiber_vector',
        'carbs_vector',
        'category_vector',
    ]

    similarity_terms = [
        f"cosineSimilarity(params.query_vector, '{name}')" for name in vector_fields
    ]
    # Each cosine term lies in [-1, 1] and script_score rejects negative scores,
    # so the offset must equal the number of terms. A constant offset shifts
    # every score equally and therefore does not change the ranking.
    script_source = " + ".join(similarity_terms) + f" + {len(vector_fields)}.0"

    search_query = {
        "size": 5,
        "query": {
            "script_score": {
                "query": {"match_all": {}},
                "script": {
                    "source": script_source,
                    "params": {
                        "query_vector": vector
                    }
                }
            }
        },
        "_source": ["Food", "Measure", "Grams", "Calories", "Protein", "Fat", "SatFat", "Fiber", "Carbs", "Category", "id"]
    }

    es_results = es_client.search(
        index=index_name,
        body=search_query
    )

    return [hit['_source'] for hit in es_results['hits']['hits']]

In [None]:
def vector_combined_knn(q):
    """Embed a question and search across all vector fields at once.

    Args:
        q: Either the question string, or a dict holding it under 'question'
            (`evaluate` passes each ground-truth record to the search function
            unchanged). NOTE(review): assumes ground-truth records use the
            'question' key — confirm against the CSV header.

    Returns:
        The list of documents returned by `elastic_search_knn_combined`.
    """
    question = q['question'] if isinstance(q, dict) else q
    v_q = model.encode(question)

    return elastic_search_knn_combined(v_q)

# NOTE(review): `evaluate` is defined further down, under "Evaluation Functions".
# On a fresh kernel this call raises NameError unless that cell has been run first;
# consider moving this evaluation below the definition.
print("\nCombined Vector Search Results:")
combined_results = evaluate(ground_truth, vector_combined_knn)
print(combined_results)

#### Test Search

In [None]:
# Smoke-test a single-field search with a record shaped like {'question': ...}.
query = {'question': 'How many calories are in an apple?'}

# Pass the question text rather than the wrapping dict: the search helper
# forwards its argument to the encoder, which is given plain strings elsewhere.
results = food_vector_knn(query['question'])

print("\nResults:", results)

#### Evaluation Functions

Hit Rate

In [37]:
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains the relevant document.

    Args:
        relevance_total: One list of booleans per query; element i is True
            when the i-th retrieved document is the expected one.

    Returns:
        Number of queries with at least one True, divided by the number of
        queries. Returns 0.0 for an empty input instead of dividing by zero.
    """
    if not relevance_total:
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / len(relevance_total)

MRR

In [38]:
def mrr(relevance_total):
    """Mean reciprocal rank of the first relevant result per query.

    Args:
        relevance_total: One list of booleans per query; element i is True
            when the i-th retrieved document is the expected one.

    Returns:
        The average over all queries of 1 / (rank of the first True), where a
        query without any True contributes 0. Returns 0.0 for an empty input
        instead of dividing by zero.
    """
    if not relevance_total:
        return 0.0

    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the first relevant hit counts; without this, a list with
                # several relevant results would push the score above 1.
                break

    return total_score / len(relevance_total)

Ground Truth

In [39]:
def evaluate(ground_truth, search_function):
    """Score a search function against the ground-truth records.

    Args:
        ground_truth: Iterable of records; each must provide the expected
            document id under the 'document' key.
        search_function: Callable that receives one ground-truth record and
            returns a list of documents, each with an 'id' key.

    Returns:
        Dict with 'hit_rate' and 'mrr' computed over all records.
    """
    relevance_total = []

    for record in tqdm(ground_truth):
        expected_id = record['document']
        retrieved = search_function(record)
        # One boolean per retrieved document, in ranking order.
        relevance_total.append([doc['id'] == expected_id for doc in retrieved])

    scores = {}
    scores['hit_rate'] = hit_rate(relevance_total)
    scores['mrr'] = mrr(relevance_total)
    return scores

### Evaluations & Results

**Single Query Tests**
- Shows how each vector search performs on one specific question, "How many calories are in an apple?", by printing the documents it actually retrieves.

In [None]:
query = 'How many calories are in an apple?'

# (printed header, search function) pairs, run in this order against the same query.
single_query_searches = [
    ("\nFood Vector Results:", food_vector_knn),
    ("\nMeasure Vector Results:", measure_vector_knn),
    ("\nGrams Vector Results:", grams_vector_knn),
    ("\nCalories Vector Results:", calories_vector_knn),
    ("\nProtein Vector Results:", protein_vector_knn),
    ("\nFat Vector Results:", fat_vector_knn),
    ("\nSaturated Fat Vector Results:", satfat_vector_knn),
    ("\nFiber Vector Results:", fiber_vector_knn),
    ("\nCarbs Vector Results:", carbs_vector_knn),
    ("\nCategory Vector Results:", category_vector_knn),
    ("\nCombined Vector Results:", vector_combined_knn),
]

for header, search in single_query_searches:
    print(header)
    results = search(query)
    print(results)

In [None]:
query = 'How many calories are in an apple?'

# Display label -> search function. This mapping is also iterated by the
# ground-truth evaluation cell further down.
vector_types = {
    'Food Vector': food_vector_knn,
    'Measure Vector': measure_vector_knn,
    'Grams Vector': grams_vector_knn,
    'Calories Vector': calories_vector_knn,
    'Protein Vector': protein_vector_knn,
    'Fat Vector': fat_vector_knn,
    'Saturated Fat Vector': satfat_vector_knn,
    'Fiber Vector': fiber_vector_knn,
    'Carbs Vector': carbs_vector_knn,
    'Category Vector': category_vector_knn,
    'Combined Vector': vector_combined_knn
}

# Run the query through every search and tag each hit with the search it came from.
query_results = []
for vector_name, vector_func in vector_types.items():
    results = vector_func(query)
    for result in results:
        result['Vector_Type'] = vector_name
        query_results.append(result)

# One row per retrieved document.
query_df = pd.DataFrame(query_results)

print("\nQuery Results DataFrame:")
print(query_df)

**Ground Truth Performance**
- Evaluates each vector type's retrieval accuracy against known correct answers using hit rate and MRR metrics 

In [None]:
# (printed header, search function) pairs; each search is scored on the full ground truth.
ground_truth_searches = [
    ("\nFood Vector Results:", food_vector_knn),
    ("\nMeasure Vector Results:", measure_vector_knn),
    ("\nGrams Vector Results:", grams_vector_knn),
    ("\nCalories Vector Results:", calories_vector_knn),
    ("\nProtein Vector Results:", protein_vector_knn),
    ("\nFat Vector Results:", fat_vector_knn),
    ("\nSaturated Fat Vector Results:", satfat_vector_knn),
    ("\nFiber Vector Results:", fiber_vector_knn),
    ("\nCarbs Vector Results:", carbs_vector_knn),
    ("\nCategory Vector Results:", category_vector_knn),
]

for header, search in ground_truth_searches:
    print(header)
    print(evaluate(ground_truth, search))

In [None]:
# Score every search registered in `vector_types` and keep one summary row each.
eval_results = []

for vector_name, vector_func in vector_types.items():
    metrics = evaluate(ground_truth, vector_func)
    summary_row = dict(
        Vector_Type=vector_name,
        Hit_Rate=metrics['hit_rate'],
        MRR=metrics['mrr'],
    )
    eval_results.append(summary_row)

# One row per vector type with its hit rate and MRR.
eval_df = pd.DataFrame(eval_results)

print("\nGround Truth Evaluation DataFrame:")
print(eval_df)

**Vector Performance Comparison**
- Provides a condensed side-by-side comparison of how each vector type performs against ground truth

In [None]:
# Side-by-side printout of the metrics for every search.
print("\nFinal Comparison:")

# Empty column containers; not filled in by this cell.
comparison_data = {column: [] for column in ('Vector_Type', 'Hit_Rate', 'MRR')}

# (printed label, search function) pairs, evaluated in this order.
labelled_searches = [
    ("Food Vector:", food_vector_knn),
    ("Measure Vector:", measure_vector_knn),
    ("Grams Vector:", grams_vector_knn),
    ("Calories Vector:", calories_vector_knn),
    ("Protein Vector:", protein_vector_knn),
    ("Fat Vector:", fat_vector_knn),
    ("SatFat Vector:", satfat_vector_knn),
    ("Fiber Vector:", fiber_vector_knn),
    ("Carbs Vector:", carbs_vector_knn),
    ("Category Vector:", category_vector_knn),
]
for label, search in labelled_searches:
    print(label, evaluate(ground_truth, search))

# The combined search was already evaluated earlier; reuse that result.
print("Combined:", combined_results)

In [None]:
# Build a DataFrame with one row per search and its hit rate / MRR.
# Each search is evaluated exactly once and both metrics are read from that
# single result; the earlier version ran the full ground-truth evaluation
# twice per search (once for each metric).
comparison_searches = [
    ('Food Vector', food_vector_knn),
    ('Measure Vector', measure_vector_knn),
    ('Grams Vector', grams_vector_knn),
    ('Calories Vector', calories_vector_knn),
    ('Protein Vector', protein_vector_knn),
    ('Fat Vector', fat_vector_knn),
    ('SatFat Vector', satfat_vector_knn),
    ('Fiber Vector', fiber_vector_knn),
    ('Carbs Vector', carbs_vector_knn),
    ('Category Vector', category_vector_knn),
]

comparison_results = {
    'Vector_Type': [],
    'Hit_Rate': [],
    'MRR': []
}

for label, search in comparison_searches:
    metrics = evaluate(ground_truth, search)
    comparison_results['Vector_Type'].append(label)
    comparison_results['Hit_Rate'].append(metrics['hit_rate'])
    comparison_results['MRR'].append(metrics['mrr'])

# The combined search was already evaluated earlier; reuse that result.
comparison_results['Vector_Type'].append('Combined')
comparison_results['Hit_Rate'].append(combined_results['hit_rate'])
comparison_results['MRR'].append(combined_results['mrr'])

comparison_df = pd.DataFrame(comparison_results)

print("\nComparison Results DataFrame:")
print(comparison_df)