### NutriChat - RAG System Evaluation

This notebook evaluates our RAG system performance using:
1. Cosine similarity between generated and original answers
2. LLM as a judge for response quality
3. Analysis across different food categories
4. Performance comparison across different LLMs (GPT-4o, GPT-4o mini, GPT-3.5)

#### Imports

In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import json
import requests
import seaborn as sns
from openai import OpenAI
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer
from concurrent.futures import ThreadPoolExecutor

#### Data

Documents with IDs 

In [None]:
# Load the nutrition documents (each record carries an `id`) from the project repo.
docs_url = 'https://raw.githubusercontent.com/milanimcgraw/NutriChat/main/nutrition-docs-with-ids.json'
# A timeout keeps a stalled connection from hanging the notebook indefinitely.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on a 4xx/5xx instead of trying to JSON-decode an error page.
docs_response.raise_for_status()
documents = docs_response.json()


In [None]:
documents[:5]

In [None]:
doc_idx = {d['id']: d for d in documents}

Ground Truth

In [3]:
# Load ground truth data
gt_url = 'https://raw.githubusercontent.com/milanimcgraw/NutriChat/main/nutrichat-groundtruthdata.csv'
df_ground_truth = pd.read_csv(gt_url)
ground_truth = df_ground_truth.to_dict(orient='records')

In [None]:
ground_truth[:5]

In [None]:
len(ground_truth)

In [None]:
# Inspect one ground-truth record. `rec` is bound here explicitly so the cell
# also works on a fresh kernel (it was previously only defined by a later loop).
rec = ground_truth[0]
rec

In [None]:
doc_idx = {d['id']: d for d in documents}
# doc_idx['5170565b']['text']

#### Initialize models

Sentence Transformer

In [4]:
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)

OpenAI 

In [None]:
client = OpenAI()

ElasticSearch

In [None]:
es_client = Elasticsearch('http://localhost:9200') 

#### Indexing

In [None]:
def _dense_vector_mapping(dims=384):
    """Return the mapping for one indexed, cosine-similarity ``dense_vector`` field."""
    return {
        "type": "dense_vector",
        "dims": dims,
        "index": True,
        "similarity": "cosine",
    }


# One embedding field per scalar column; the names match the keys written by
# the indexing loop below.
_vector_field_names = (
    "food_vector",
    "measure_vector",
    "grams_vector",
    "calories_vector",
    "protein_vector",
    "fat_vector",
    "satfat_vector",
    "fiber_vector",
    "carbs_vector",
    "category_vector",
)

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "Food": {"type": "text"},
            "Measure": {"type": "text"},
            "Grams": {"type": "float"},
            "Calories": {"type": "float"},
            "Protein": {"type": "float"},
            "Fat": {"type": "float"},
            # Was "Sat.Fat": the rest of the notebook reads and writes the key
            # "SatFat", so the dotted name never matched the indexed field.
            "SatFat": {"type": "float"},
            "Fiber": {"type": "float"},
            "Carbs": {"type": "float"},
            # keyword (not text) so Category can be used in exact-match term filters
            "Category": {"type": "keyword"},
            "id": {"type": "keyword"},
            **{name: _dense_vector_mapping() for name in _vector_field_names},
        }
    }
}

index_name = "nutrition-facts"

# Drop any previous copy of the index so re-running this cell starts clean.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

In [None]:
### Create and index document vectors
# (vector field, source column) pairs, in the order the vectors are attached.
embedding_sources = (
    ('food_vector', 'Food'),
    ('measure_vector', 'Measure'),
    ('grams_vector', 'Grams'),
    ('calories_vector', 'Calories'),
    ('protein_vector', 'Protein'),
    ('fat_vector', 'Fat'),
    ('satfat_vector', 'SatFat'),
    ('fiber_vector', 'Fiber'),
    ('carbs_vector', 'Carbs'),
    ('category_vector', 'Category'),
)

for doc in tqdm(documents):
    # Each column value is stringified before encoding; the embedding is stored
    # on the document dict itself (so `documents` is mutated in place).
    for vector_key, source_key in embedding_sources:
        doc[vector_key] = model.encode(str(doc[source_key]))

    es_client.index(index=index_name, document=doc)

#### Retrieval

In [None]:
def elastic_search_knn(field, vector, category=None):
    """Run a kNN search against one dense-vector field of the nutrition index.

    Args:
        field: name of the ``dense_vector`` field to search (e.g. ``'category_vector'``).
        vector: query embedding to compare against ``field``.
        category: optional value for an exact-match filter on the ``Category``
            field. When ``None`` (the default) no filter is applied.
            NOTE(review): assumes caller-supplied categories use the same
            spelling as the indexed ``Category`` values; confirm against the data.

    Returns:
        A list with the ``_source`` dict of each hit (at most ``k`` = 5), in the
        order Elasticsearch returned them.
    """
    knn = {
        "field": field,
        "query_vector": vector,
        "k": 5,
        "num_candidates": 10000,
    }
    if category is not None:
        # Restrict candidates to a single category before ranking.
        knn["filter"] = {"term": {"Category": category}}

    search_query = {
        "knn": knn,
        # Only return the scalar columns; the stored embedding vectors are omitted.
        "_source": ["Food", "Measure", "Grams", "Calories", "Protein", "Fat", "SatFat", "Fiber", "Carbs", "Category", "id"]
    }

    es_results = es_client.search(
        index=index_name,
        body=search_query
    )

    return [hit['_source'] for hit in es_results['hits']['hits']]

In [None]:
# Category Vector
def category_vector_knn(q):
    """Embed the question in ``q`` and look it up against ``category_vector``.

    ``q`` must provide the keys ``'question'`` and ``'category'``; the category
    is forwarded to ``elastic_search_knn`` as its third argument.
    """
    question, category = q['question'], q['category']
    query_embedding = model.encode(question)
    return elastic_search_knn('category_vector', query_embedding, category)

In [None]:
# Test  
category_vector_knn(dict(
    question='Show me low-calorie dairy options',
    category='Dairy products'
))

#### RAG Pipeline Functions

In [None]:
# Build Prompt
def build_prompt(query, search_results):
    """Render the nutritionist prompt for ``query`` from the retrieved documents.

    Args:
        query: the user's question text.
        search_results: iterable of dicts providing the keys Food, Measure,
            Calories, Protein, Fat, SatFat, Fiber, Carbs and Category.

    Returns:
        The prompt string, with surrounding whitespace stripped.
    """
    # Same text as the indented triple-quoted template after .strip(); the
    # four-space indents and the trailing space after "CONTEXT:" are kept.
    prompt_template = (
        "You're a nutritionist working as a nutrition facts chat assistant. Answer the QUESTION based on the CONTEXT from the nutrition data.\n"
        "    Use only the facts from the CONTEXT when answering the QUESTION. Be specific about measurements and values.\n"
        "\n"
        "    QUESTION: {question}\n"
        "\n"
        "    CONTEXT: \n"
        "    {context}"
    )

    # One text block per retrieved document, each terminated by a blank line.
    blocks = []
    for doc in search_results:
        blocks.append(
            f"Food: {doc['Food']}\n"
            f"Measure: {doc['Measure']}\n"
            f"Nutritional Information: {doc['Calories']} calories, {doc['Protein']}g protein, "
            f"{doc['Fat']}g fat, {doc['SatFat']}g saturated fat, "
            f"{doc['Fiber']}g fiber, {doc['Carbs']}g carbohydrates\n"
            f"Category: {doc['Category']}\n\n"
        )
    context = "".join(blocks)

    return prompt_template.format(question=query, context=context).strip()

In [115]:
# LLM Function
# client = OpenAI() --already initialized
def llm(prompt, model='gpt-4o'):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: full prompt text to send.
        model: chat-completions model name passed through to the client.

    Returns:
        The message content of the first choice in the response.
    """
    chat = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(model=model, messages=chat)
    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
# Pipeline
def rag(query: dict, model='gpt-4o') -> str:
    """Retrieve context for ``query``, build the prompt and return the LLM's answer.

    ``query`` is passed whole to ``category_vector_knn``; only its
    ``'question'`` entry is used for the prompt text.
    """
    retrieved = category_vector_knn(query)
    return llm(build_prompt(query['question'], retrieved), model=model)

In [None]:
ground_truth[10]

In [None]:
rag(ground_truth[10])

In [None]:
# doc_idx['5170565b']['text']

#### Cosine Similarity Metric

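>A -> Q -> A' cosine similarity:
>start from an original answer A, take a question Q generated from it, and let the RAG system produce an answer A' to Q (A -> Q -> A');
>the score is cosine(A, A').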

In [None]:
# Worked example: compare one reference answer with one generated answer.
answer_orig = 'Yes, sessions are recorded if you miss one. Everything is recorded, allowing you to catch up on any missed content. Additionally, you can ask questions in advance for office hours and have them addressed during the live stream. You can also ask questions in Slack.'
answer_llm = 'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.'

v_llm = model.encode(answer_llm)
v_orig = model.encode(answer_orig)

# Dot product of the two embeddings, shown as the cell output.
# NOTE(review): this equals cosine similarity only if the encoder returns
# unit-length vectors; confirm for the configured model.
v_llm.dot(v_orig)

In [None]:
answers = {}

In [None]:
for i, rec in enumerate(tqdm(ground_truth)):
    # Indices already present in `answers` are skipped, so re-running the cell
    # after an interruption does not regenerate them.
    if i not in answers:
        generated = rag(rec)
        source_id = rec['document']
        # NOTE(review): assumes every source document carries a 'text' field; confirm.
        reference = doc_idx[source_id]['text']

        answers[i] = dict(
            answer_llm=generated,
            answer_orig=reference,
            document=source_id,
            question=rec['question'],
            category=rec['category'],
        )
    

##### **GPT-4o**

In [107]:
# One slot per ground-truth record; slots without a generated answer stay None.
results_gpt4o = [None] * len(ground_truth)

for i, val in answers.items():
    # Generated-answer fields first, then the ground-truth fields layered on
    # top (ground-truth values win on any shared key).
    results_gpt4o[i] = {**val, **ground_truth[i]}

In [95]:
import pandas as pd

In [110]:
df_gpt4o = pd.DataFrame(results_gpt4o)

In [112]:
!mkdir Results

In [113]:
df_gpt4o.to_csv('Results/nutrichat-gpt4oevalresults.csv', index=False)

##### **GPT 3.5 Turbo**

In [None]:
rag(ground_truth[10], model='gpt-3.5-turbo')

In [120]:
pool = ThreadPoolExecutor(max_workers=6)

def map_progress(pool, seq, f):
    """Apply ``f`` to every element of ``seq`` on ``pool`` with a progress bar.

    Args:
        pool: executor exposing ``submit`` (a ``ThreadPoolExecutor`` here).
        seq: sized sequence of inputs; its length sets the bar total.
        f: one-argument callable applied to each element.

    Returns:
        The results as a list in the same order as ``seq``. An exception raised
        by ``f`` is re-raised when that element's result is collected.
    """
    with tqdm(total=len(seq)) as progress:

        def _tick(_finished):
            # Advance the bar once for every completed future.
            progress.update()

        futures = []
        for el in seq:
            future = pool.submit(f, el)
            future.add_done_callback(_tick)
            futures.append(future)

        # Collect in submission order, not completion order.
        return [future.result() for future in futures]

In [121]:
def process_record(rec, model='gpt-3.5-turbo'):
    """Generate a RAG answer for one ground-truth record and pair it with its reference.

    Args:
        rec: ground-truth record providing ``'question'``, ``'category'`` and
            ``'document'`` (the id of the source document).
        model: chat model name forwarded to ``rag``. Defaults to
            ``'gpt-3.5-turbo'``, the value that used to be hard-coded, so
            existing single-argument callers are unaffected.

    Returns:
        Dict with keys ``answer_llm``, ``answer_orig``, ``document``,
        ``question`` and ``category``.
    """
    answer_llm = rag(rec, model=model)

    doc_id = rec['document']
    original_doc = doc_idx[doc_id]
    # NOTE(review): assumes the source document carries a 'text' field; confirm.
    answer_orig = original_doc['text']

    return {
        'answer_llm': answer_llm,
        'answer_orig': answer_orig,
        'document': doc_id,
        'question': rec['question'],
        'category': rec['category'],
    }

In [None]:
process_record(ground_truth[10])

In [None]:
results_gpt35 = map_progress(pool, ground_truth, process_record)

In [124]:
df_gpt35 = pd.DataFrame(results_gpt35)
df_gpt35.to_csv('Results/nutrichat-gpt35evalresults.csv', index=False)

In [None]:
!head Results/nutrichat-gpt35evalresults.csv

### Cosine Similarity (Eval)

##### **Evaluating GPT-4o**

In [128]:
# Row-wise records of the GPT-4o results; this is the list the similarity loop iterates over.
results_gpt4o = df_gpt4o.to_dict(orient='records')

In [130]:
# First GPT-4o result, kept for spot-checking.
record = results_gpt4o[0]

In [134]:
def compute_similarity(record):
    """Score one result as the dot product of its two answer embeddings.

    ``record`` must provide ``'answer_orig'`` and ``'answer_llm'``.
    NOTE(review): the dot product equals cosine similarity only if the encoder
    returns unit-length vectors; confirm for the configured model.
    """
    reference_text, generated_text = record['answer_orig'], record['answer_llm']

    generated_vec = model.encode(generated_text)
    reference_vec = model.encode(reference_text)

    return generated_vec.dot(reference_vec)

In [135]:
# One similarity score per GPT-4o result, in result order.
similarity = [compute_similarity(record) for record in tqdm(results_gpt4o)]

  0%|          | 0/1830 [00:00<?, ?it/s]

In [None]:
df_gpt4o['cosine'] = similarity
df_gpt4o['cosine'].describe()

In [None]:
import seaborn as sns

##### **Evaluating GPT-3.5-turbo**

In [None]:
# Rebuild the records from the DataFrame, then score each one in order.
results_gpt35 = df_gpt35.to_dict(orient='records')

similarity_35 = [compute_similarity(record) for record in tqdm(results_gpt35)]

In [None]:
df_gpt35['cosine'] = similarity_35
df_gpt35['cosine'].describe()

In [149]:
import matplotlib.pyplot as plt

##### **GPT-4o-mini**

In [151]:
def process_record_4o_mini(rec, model='gpt-4o-mini'):
    """Generate a RAG answer for one ground-truth record and pair it with its reference.

    Args:
        rec: ground-truth record providing ``'question'``, ``'category'`` and
            ``'document'`` (the id of the source document).
        model: chat model name forwarded to ``rag``. Defaults to
            ``'gpt-4o-mini'``, the value that used to be hard-coded, so
            existing single-argument callers are unaffected.

    Returns:
        Dict with keys ``answer_llm``, ``answer_orig``, ``document``,
        ``question`` and ``category``.
    """
    answer_llm = rag(rec, model=model)

    doc_id = rec['document']
    original_doc = doc_idx[doc_id]
    # NOTE(review): assumes the source document carries a 'text' field; confirm.
    answer_orig = original_doc['text']

    return {
        'answer_llm': answer_llm,
        'answer_orig': answer_orig,
        'document': doc_id,
        'question': rec['question'],
        'category': rec['category'],
    }

In [None]:
process_record_4o_mini(ground_truth[10])

In [None]:
results_gpt4omini = []

In [None]:
# Sequentially answer every ground-truth record with GPT-4o-mini, appending to
# the list created in the previous cell.
results_gpt4omini.extend(
    process_record_4o_mini(record) for record in tqdm(ground_truth)
)

In [160]:
df_gpt4o_mini = pd.DataFrame(results_gpt4omini)
df_gpt4o_mini.to_csv('Results/nutrichat-gpt4ominievalresults.csv', index=False)

In [None]:
# One similarity score per GPT-4o-mini result, in result order.
similarity_4o_mini = [compute_similarity(record) for record in tqdm(results_gpt4omini)]

In [None]:
df_gpt4o_mini['cosine'] = similarity_4o_mini
df_gpt4o_mini['cosine'].describe()

gpt4o 

```
count    1830.000000
mean        0.679129
std         0.217995
min        -0.153426
25%         0.591460
50%         0.734788
75%         0.835390
max         0.995339
Name: cosine, dtype: float64
```

In [None]:
# `sns.distplot` is deprecated; `histplot(..., kde=True, stat='density')` is the
# replacement for its default histogram-plus-KDE output.
# sns.histplot(df_gpt35['cosine'], kde=True, stat='density', label='3.5')

sns.histplot(df_gpt4o['cosine'], kde=True, stat='density', label='4o')
sns.histplot(df_gpt4o_mini['cosine'], kde=True, stat='density', label='4o-mini')

plt.title("RAG LLM performance")
plt.xlabel("A->Q->A' Cosine Similarity")
plt.legend()

## LLM-as-a-Judge

In [234]:
# Judge prompt 1 (A -> Q -> A'): compares the generated answer with the original answer.
# Placeholders: {answer_orig}, {question}, {answer_llm}.
# Literal JSON braces are doubled so str.format() leaves them in the output.
prompt1_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer compared to the original answer provided.
Based on the relevance and similarity of the generated answer to the original answer, you will classify
it as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Original Answer: {answer_orig}
Generated Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the original
answer and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

# Judge prompt 2 (Q -> A): rates the generated answer against the question only.
# Placeholders: {question}, {answer_llm}.
# The JSON skeleton's braces were single here, which made .format() treat the
# skeleton as a replacement field and raise; they are now escaped like prompt 1.
prompt2_template = """
You are an expert evaluator for a nutrition facts Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Consider:
1. Nutritional accuracy
2. Completeness of information
3. Direct answer to the question asked

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()


In [169]:
df_sample = df_gpt4o_mini.sample(n=150, random_state=1)

In [173]:
samples = df_sample.to_dict(orient='records')

In [None]:
record = samples[0]
record

In [None]:
prompt = prompt1_template.format(**record)
print(prompt)

In [210]:
answer = llm(prompt, model='gpt-4o-mini')

In [187]:
import json

In [224]:
# Raw judge responses (strings) for the A -> Q -> A' prompt, one per sampled record.
evaluations = []

# `record` stays the loop variable: a later cell formats a prompt from it.
for record in tqdm(samples):
    evaluations.append(
        llm(prompt1_template.format(**record), model='gpt-4o-mini')
    )

  0%|          | 0/150 [00:00<?, ?it/s]

In [225]:
# Parse every raw judge response; a response that is not valid JSON raises here.
json_evaluations = [json.loads(str_eval) for str_eval in evaluations]

In [227]:
df_evaluations = pd.DataFrame(json_evaluations)

In [None]:
df_evaluations.Relevance.value_counts()

In [None]:
df_evaluations[df_evaluations.Relevance == 'NON_RELEVANT'] #.to_dict(orient='records')

In [None]:
# Inspect one sampled record (the list is named `samples`; `sample` was never defined).
samples[4]

In [None]:
prompt = prompt2_template.format(**record)
print(prompt)

In [None]:
evaluation = llm(prompt, model='gpt-4o-mini')
print(evaluation)

In [None]:
# Raw judge responses (strings) for the Q -> A prompt, one per sampled record.
evaluations_2 = []

for record in tqdm(samples):
    evaluations_2.append(
        llm(prompt2_template.format(**record), model='gpt-4o-mini')
    )

In [238]:
# Parse every raw Q -> A judge response; a response that is not valid JSON raises here.
json_evaluations_2 = [json.loads(str_eval) for str_eval in evaluations_2]

In [239]:
df_evaluations_2 = pd.DataFrame(json_evaluations_2)

In [None]:
df_evaluations_2[df_evaluations_2.Relevance == 'NON_RELEVANT']

In [None]:
samples[45]

## Saving all the data

In [244]:
df_gpt4o.to_csv('Results/nutrichat-gpt4oevalresults.csv', index=False)
df_gpt35.to_csv('Results/nutrichat-gpt35evalresults.csv', index=False)
df_gpt4o_mini.to_csv('Results/nutrichat-gpt4ominievalresults.csv', index=False)

In [245]:
df_evaluations.to_csv('Results/nutrichatevaluations-aqa.csv', index=False)
df_evaluations_2.to_csv('Results/nutrichatevaluations-qa.csv', index=False)