### NutriChat - Ground Truth Data Generation

This notebook generates ground truth data for evaluating our nutrition facts RAG system. The process includes:
1. Creating unique document IDs for nutrition records
2. Generating diverse nutrition-related questions using an LLM
3. Building question-answer pairs for evaluation
4. Creating comprehensive dataset covering all food categories

In [None]:
# Import Libraries
import hashlib
import json
import pickle
import string
from collections import defaultdict

import pandas as pd
import requests
from openai import OpenAI
from tqdm.auto import tqdm

In [1]:
# Load nutrition data
docs_url = 'https://raw.githubusercontent.com/milanimcgraw/NutriChat/refs/heads/main/nutritionfacts.json'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail fast with a clear HTTP error instead of a confusing JSON decode error
# (or a KeyError below) when the download did not succeed.
docs_response.raise_for_status()
nutrition_data = docs_response.json()

# Convert to list of documents (the records live under the 'nutritionfacts' key)
documents = nutrition_data['nutritionfacts']


In [21]:
# Generate document IDs
def generate_document_id(doc):
    """Return a short, deterministic ID derived from a nutrition record.

    The ID is the first 8 hex characters of the MD5 digest of the record's
    fields joined with '-'. Identical records therefore get identical IDs.

    Args:
        doc: Mapping with the keys 'Food', 'Measure', 'Grams', 'Calories',
            'Protein', 'Fat', 'Sat.Fat', 'Fiber', 'Carbs' and 'Category'.

    Returns:
        An 8-character lowercase hexadecimal string.

    Raises:
        KeyError: If any of the expected keys is missing from ``doc``.
    """
    # Field order is part of the ID definition: changing it changes every ID.
    id_fields = (
        'Food', 'Measure', 'Grams', 'Calories', 'Protein',
        'Fat', 'Sat.Fat', 'Fiber', 'Carbs', 'Category',
    )
    combined = "-".join(str(doc[field]) for field in id_fields)
    hash_object = hashlib.md5(combined.encode())
    hash_hex = hash_object.hexdigest()
    # 8 hex characters keep IDs readable; collisions are checked separately.
    document_id = hash_hex[:8]
    return document_id

In [22]:
# Attach the generated ID to every record in place, under the 'id' key
for record in documents:
    record_id = generate_document_id(record)
    record['id'] = record_id

In [None]:
# Inspect a single record (e.g. to confirm the 'id' key was added)
documents[5]

In [24]:
# Group records by ID so that IDs shared by several records can be spotted
hashes = defaultdict(list)

for record in documents:
    hashes[record['id']].append(record)

In [25]:
# Fewer unique IDs than documents means at least two records share an ID
total_documents = len(documents)
unique_id_count = len(hashes)
print(f"Total documents: {total_documents}")
print(f"Unique document IDs: {unique_id_count}")

(947, 948)

In [26]:
# Checking for collisions: report every ID that is shared by more than one record.
# (A single pass is enough; each collision is reported exactly once.)
for k, values in hashes.items():
    if len(values) > 1:
        print(f"Collision found for ID {k}: {len(values)} documents")

593f7569 2


In [None]:
# Show the records grouped under one specific ID
hashes['593f7569']

In [33]:
# Write the ID-annotated records to disk as pretty-printed JSON
serialized_documents = json.dumps(documents, indent=2)
with open('nutrichat-docs-with-ids.json', 'wt') as out_file:
    out_file.write(serialized_documents)

In [34]:
# Preview the first lines of the saved JSON file (shell command via IPython)
!head nutrichat-docs-with-ids.json

[
  {
    "text": "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  \u201cOffice Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon\u2019t forget to register in DataTalks.Club's Slack and join the channel.",
    "section": "General course-related questions",
    "question": "Course - When will the course start?",
    "course": "data-engineering-zoomcamp",
    "id": "c02e79ef"
  },
  {
    "text": "GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites",


In [35]:
# Define prompt template for generating questions.
# Placeholders are named after the keys of a nutrition record
# (Food, Category, Measure, Calories, Grams, Protein, Fat, Sat.Fat, Fiber, Carbs).
# NOTE(review): `{Sat.Fat}` contains a dot, which str.format() parses as
# attribute `Fat` of a field named `Sat`; a plain `.format(**doc)` therefore
# looks up the key `Sat`, not `Sat.Fat` — confirm the rendering code resolves
# this placeholder as a whole key.
prompt_template = """
You are a user interested in nutrition information.
Generate 5 different ways to ask about different food items and it's nutritional content.
Make questions specific and varied (e.g., about calories, protein, comparisons).

Food Item:
Name: {Food}
Category: {Category}
Measure: {Measure}
Nutritional Info: {Calories} calories, {Grams}g grams, {Protein}g protein, {Fat}g fat, {Sat.Fat}g saturated fat, {Fiber}g fiber, {Carbs}g carbs

Questions should be natural and diverse. Include questions about:
1. General nutritional content
2. Specific nutrients
3. Serving sizes
4. Comparisons within category
5. Health-related inquiries

Output format (JSON array):
["question1", "question2", "question3", "question4", "question5"]
""".strip()

In [38]:
# Create the OpenAI client used for question generation.
# NOTE(review): no credentials are passed here — presumably the SDK reads the
# API key from the environment (e.g. OPENAI_API_KEY); confirm before running.
client = OpenAI()

In [None]:
# Generate questions function
class _DocKeyFormatter(string.Formatter):
    """Formatter that resolves a whole field name as a mapping key first.

    Needed because the record key 'Sat.Fat' contains a dot: the default
    str.format() field syntax would read `{Sat.Fat}` as attribute `Fat` of a
    field named `Sat` and fail with KeyError('Sat').
    """

    def get_field(self, field_name, args, kwargs):
        # Exact key match wins; anything else falls back to standard parsing.
        if field_name in kwargs:
            return kwargs[field_name], field_name
        return super().get_field(field_name, args, kwargs)


def generate_questions(doc):
    """Ask the chat model for evaluation questions about one nutrition record.

    Args:
        doc: Nutrition record whose keys match the placeholders of
            ``prompt_template`` (extra keys such as 'id' are ignored).

    Returns:
        The raw text content of the model's first choice. The prompt asks for
        a JSON array of questions, but the text is returned unparsed.

    Raises:
        KeyError: If the template references a key missing from ``doc``.
    """
    # vformat (rather than .format(**doc)) so '{Sat.Fat}' is looked up as a key.
    prompt = _DocKeyFormatter().vformat(prompt_template, (), doc)

    response = client.chat.completions.create(
        model='gpt-4o',
        messages=[{"role": "user", "content": prompt}]
    )

    json_response = response.choices[0].message.content
    return json_response

In [49]:
# Generate questions for each document
results = {}

for doc in tqdm(documents):
    doc_id = doc['id']
    # Records that share an ID are only sent to the model once.
    if doc_id in results:
        continue

    questions = generate_questions(doc)
    # Skip empty responses so downstream parsing only sees real content.
    if questions:
        results[doc_id] = questions

# Persist the raw responses: the following cell reloads them from
# 'results.bin', so without this dump a fresh Restart & Run All would fail
# (and the paid API calls would have to be repeated after a kernel restart).
with open('results.bin', 'wb') as f_out:
    pickle.dump(results, f_out)

In [53]:
# Reload the cached raw model responses from disk.
# Only unpickle files you created yourself: pickle can execute arbitrary code.
with open('results.bin', 'rb') as cache_file:
    cached_bytes = cache_file.read()
results = pickle.loads(cached_bytes)

In [55]:
# Spot-check the raw (still JSON-encoded) response stored for one document ID
results['1f6520ca']

'["Where can I find the prerequisites for this course?", "How do I check the prerequisites for this course?", "Where are the course prerequisites listed?", "What are the requirements for joining this course?", "Where is the list of prerequisites for the course?"]'

In [65]:
# Decode each raw JSON response string into a Python object, keyed by document ID
parsed_results = {
    document_id: json.loads(raw_response)
    for document_id, raw_response in results.items()
}

In [67]:
# Build a lookup table from document ID to record
# (when several records share an ID, the last one wins)
doc_index = {}
for record in documents:
    doc_index[record['id']] = record

In [None]:
# Flatten to one row per generated question, carrying the record's fields along
final_results = []

for document_id, question_list in parsed_results.items():
    record = doc_index[document_id]

    # Fields shared by every question generated for this record
    record_fields = {
        'food': record['Food'],
        'measure': record['Measure'],
        'grams': record['Grams'],
        'calories': record['Calories'],
        'protein': record['Protein'],
        'fat': record['Fat'],
        'sat.fat': record['Sat.Fat'],
        'fiber': record['Fiber'],
        'carbs': record['Carbs'],
        'category': record['Category'],
        'document_id': document_id,
    }

    for question in question_list:
        final_results.append({'question': question, **record_fields})

In [None]:
# Build the ground-truth table with a fixed column order and write it to CSV
ground_truth_columns = [
    'question',
    'food',
    'measure',
    'grams',
    'calories',
    'protein',
    'fat',
    'sat.fat',
    'fiber',
    'carbs',
    'category',
    'document_id',
]
df = pd.DataFrame(final_results, columns=ground_truth_columns)
df.to_csv('nutrichat-groundtruthdata.csv', index=False)

In [None]:
# Show the first ten ground-truth rows under a heading
sample_rows = df.head(10)
print("\nSample ground truth data:", sample_rows, sep="\n")

In [None]:
# Summarise the size and coverage of the generated dataset
question_count = len(df)
food_count = df['food'].nunique()
category_count = df['category'].nunique()

print("\nGround Truth Dataset Statistics:")
print(f"Total questions generated: {question_count}")
print(f"Unique food items: {food_count}")
print(f"Categories covered: {category_count}")

In [None]:
# Count how many questions fall into each food category
questions_per_category = df['category'].value_counts()
print("\nQuestions per category:")
print(questions_per_category)