#### Imports

In [None]:
from elasticsearch import Elasticsearch
import requests 
from tqdm.auto import tqdm
from openai import OpenAI

#### Load Data

In [None]:
# Download the raw nutrition dataset (JSON) from the project repository.
docs_url = 'https://raw.githubusercontent.com/milanimcgraw/NutriChat/refs/heads/main/nutritionfacts.json'
# A timeout keeps a stalled connection from hanging the notebook indefinitely.
docs_response = requests.get(docs_url, timeout=30)
# Fail here on an HTTP error instead of later with a confusing JSON decode error.
docs_response.raise_for_status()
nutrition_data = docs_response.json()

In [None]:
# Copy the records stored under the 'nutritionfacts' key into a plain list.
documents = list(nutrition_data['nutritionfacts'])

# Show the first record to inspect the shape of the data.
documents[0]

#### Elasticsearch

In [8]:
# Client for the Elasticsearch node at localhost:9200; used by the cells below.
es_client = Elasticsearch('http://localhost:9200') 

In [None]:
# Name of the index that holds the nutrition records.
index_name = "nutrition-facts"

# Index definition: one shard, no replicas. Food/Measure are full-text fields,
# the nutrient amounts are floats, and Category is an exact-match keyword.
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            "Food": {"type": "text"},
            "Measure": {"type": "text"},
            **{
                field: {"type": "float"}
                for field in (
                    "Grams",
                    "Calories",
                    "Protein",
                    "Fat",
                    "SatFat",
                    "Fiber",
                    "Carbs",
                )
            },
            "Category": {"type": "keyword"},
        }
    },
}

# Recreate the index from scratch: drop it if it exists, then create it anew.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

In [13]:
# Index the documents one request at a time; tqdm renders a progress bar.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

In [17]:
# Elasticsearch Search
def elastic_search(query, category=None, size=5):
    """Search the nutrition index for documents matching ``query``.

    Args:
        query: Free-text search string.
        category: Optional exact ``Category`` value to filter on. When
            ``None`` (the default) no category filter is applied.
        size: Maximum number of hits to request (defaults to 5).

    Returns:
        A list with the ``_source`` dict of each hit, in the order
        Elasticsearch returned them.
    """
    bool_query = {
        "must": {
            "multi_match": {
                "query": query,
                "fields": [
                    "Food^3",
                    "Measure",
                    "Grams",
                    "Calories",
                    "Protein",
                    "Fat",
                    "SatFat",
                    "Fiber",
                    "Carbs",
                    "Category"
                ],
                "type": "best_fields",
                # The field list includes float fields; without `lenient`
                # a non-numeric query string is rejected with a
                # number-format error instead of being ignored for them.
                "lenient": True
            }
        }
    }

    # Only add the term filter when the caller asked for a category; the
    # original referenced an undefined `category` name unconditionally.
    if category is not None:
        bool_query["filter"] = {"term": {"Category": category}}

    search_query = {"size": size, "query": {"bool": bool_query}}

    response = es_client.search(index=index_name, body=search_query)

    # Return the stored documents; the original built this list but never
    # returned it, so callers received None.
    return [hit['_source'] for hit in response['hits']['hits']]
    

In [None]:
# OpenAI client shared by the rest of the notebook (llm() uses this global).
client = OpenAI()

# Baseline: ask the model directly, without any retrieved context.
# `query` must be defined here, otherwise running the notebook top-to-bottom
# fails with a NameError because no earlier cell assigns it.
query = 'Which fruits have the most fiber?'

response = client.chat.completions.create(
    model='gpt-4',
    messages=[{"role": "user", "content": query}]
)


response.choices[0].message.content

In [18]:
# Build Prompt
def build_prompt(query, search_results):
    """Assemble the LLM prompt from a question and retrieved documents.

    Args:
        query: The user's question, inserted after ``QUESTION:``.
        search_results: Iterable of dicts with the keys ``Food``,
            ``Measure``, ``Calories``, ``Protein``, ``Fat``, ``SatFat``,
            ``Fiber``, ``Carbs`` and ``Category``.

    Returns:
        The filled-in prompt with surrounding whitespace stripped.

    Raises:
        KeyError: If a document lacks one of the keys listed above.
    """
    prompt_template = """
    You're a nutritionist working as a nutrition facts chat assistant. Answer the QUESTION based on the CONTEXT from the nutrition data.
    Use only the facts from the CONTEXT when answering the QUESTION. Be specific about measurements and values.

    QUESTION: {question}

    CONTEXT: 
    {context}
    """.strip()

    # One text entry per document, each terminated by a blank line.
    entries = [
        f"Food: {doc['Food']}\n"
        f"Measure: {doc['Measure']}\n"
        f"Nutritional Information: {doc['Calories']} calories, {doc['Protein']}g protein, "
        f"{doc['Fat']}g fat, {doc['SatFat']}g saturated fat, "
        f"{doc['Fiber']}g fiber, {doc['Carbs']}g carbohydrates\n"
        f"Category: {doc['Category']}\n\n"
        for doc in search_results
    ]
    context = "".join(entries)

    return prompt_template.format(question=query, context=context).strip()

In [None]:
# LLM Function
def llm(prompt, model='gpt-4'):
    """Send ``prompt`` to the chat completions API and return the reply text.

    Args:
        prompt: Text sent as the single user message.
        model: Name of the chat model to use; defaults to ``'gpt-4'``,
            the value that was previously hard-coded.

    Returns:
        The ``content`` of the first choice's message.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content

#### Test Pipeline

In [19]:
def rag(query):
    """Answer ``query`` using retrieval followed by generation.

    Retrieves documents with ``elastic_search``, builds the prompt with
    ``build_prompt`` and asks the model through ``llm``.

    Returns:
        A ``(answer, search_results)`` tuple.
    """
    hits = elastic_search(query)
    reply = llm(build_prompt(query, hits))
    return reply, hits

# Sample question 1: run the full pipeline; the cell's value is the
# (answer, search_results) tuple returned by rag().
query = 'Which fruits have the most fiber?'

rag(query)

In [None]:
# Sample question 2: lookup of a single food item.
query = 'What are the nutritional facts for whole milk??'

rag(query)

In [None]:
# Sample question 3: comparison across two kinds of food.
query = 'Compare the protein content between chicken and fish'

rag(query)

In [None]:
# Sample question 4: open-ended request mentioning a food category.
query = 'Show me low-calorie dairy options'

rag(query)