# Project - Crave Control Agent
The project is split across 5 notebooks:
1. Preprocessing of the recipes
2. Embeddings creation & Pinecone
3. Reviews - preprocessing, LLM summarization & embeddings creation
4. Prompt template, calculation of top recipes & LLM chain
5. User friendly - ready-to-use notebook
User interface, data preprocessing, models initialization, models interface, Pinecone processing, LLM chain & PRODUCTION output


## API KEY - RUN THIS

In [None]:
!pip install python-dotenv
!pip install langchain-community
!pip install langchain-openai
!pip install langchain-community langchain-core
!pip install langchain-google-genai
!pip install pinecone
!pip install tiktoken

Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1


In [None]:
from dotenv import load_dotenv
from pprint import pprint
import getpass
import os

# Load KEY=VALUE pairs from the local file named "env" into the process environment.
load_dotenv("env")

# Read the service settings; os.getenv yields None for any variable that is not set.
AZURE_OPENAI_API_KEY =  os.getenv("API_KEY")
EMBEDDINGS_DEPLOYMENT = os.getenv("EMBEDDING_DEPLOYMENT")
GPT_DEPLOYMENT = os.getenv("GPT_DEPLOYMENT")
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
API_VERSION = os.getenv("API_VERSION")
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')

#GOOGLE_API_KEY = os.environ['GOOGLE_API_KEY']


# Printing the key is left commented out; enabling it would write the secret into the cell output.
#print(f"API Key: {AZURE_OPENAI_API_KEY}")
# Echo the non-secret settings so a missing variable (printed as None) is easy to spot.
print(f"Embedding Deployment: {EMBEDDINGS_DEPLOYMENT}")
print(f"CHAT Deployment: {GPT_DEPLOYMENT}")
print(f"AZURE_OPENAI_ENDPOINT: {AZURE_OPENAI_ENDPOINT}")
print(f"API_VERSION: {API_VERSION}")

def debug(s, title=None):
    """Print ``s`` with a ``[*]`` prefix, preceded by a ``[*] <title>`` line when ``title`` is truthy."""
    lines = [title, s] if title else [s]
    for text in lines:
        print(f"[*] {text}")

Embedding Deployment: team1-embedding
CHAT Deployment: team1-gpt4o
AZURE_OPENAI_ENDPOINT: https://096290-oai.openai.azure.com
API_VERSION: 2023-05-15


In [None]:
# Fail fast when the Azure OpenAI key (environment variable "API_KEY") is absent or empty,
# so later cells do not fail with a less obvious authentication error.
api_key = os.getenv("API_KEY")
if api_key:
    print("API Key is set and ready to use.")
else:
    raise Exception(" API_KEY is missing! Please set it in your environment.")

API Key is set and ready to use.


## Initialize the model

In [None]:
from openai import OpenAI
from langchain.chat_models import AzureChatOpenAI
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema import HumanMessage, SystemMessage

In [None]:
from openai import OpenAI, AzureOpenAI
from langchain_openai import AzureOpenAIEmbeddings
from pinecone import Pinecone


# --- Connection settings ----------------------------------------------------
# The key comes from the environment; the remaining values are fixed for this project.
AZURE_OPENAI_API_KEY = os.getenv('API_KEY')
AZURE_OPENAI_ENDPOINT = "https://096290-oai.openai.azure.com"

# Embedding deployment (each constant is defined exactly once).
EMBEDDINGS_DEPLOYMENT = "team1-embedding"
EMBEDDINGS_MODEL_NAME = "text-embedding-3-small"
EMBEDDINGS_API_VERSION = "2024-08-01-preview"

# Chat deployment.
CHAT_DEPLOYMENT = "team1-gpt4o"
CHAT_API_VERSION = "2023-05-15"

# Raw Azure OpenAI client bound to the embedding deployment.
client = AzureOpenAI(
    api_key=AZURE_OPENAI_API_KEY,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_version=EMBEDDINGS_API_VERSION,  # same value the literal used to duplicate
    azure_deployment=EMBEDDINGS_DEPLOYMENT,
)

# LangChain embedding wrapper; generate_query uses it to embed the search query.
embedder_gpt_model = AzureOpenAIEmbeddings(
    model=EMBEDDINGS_MODEL_NAME,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    azure_deployment=EMBEDDINGS_DEPLOYMENT,
    api_key=AZURE_OPENAI_API_KEY,
    api_version=EMBEDDINGS_API_VERSION,
    openai_api_type="azure",
)

# Chat model used for structuring the user text and for writing the search query.
chat = AzureChatOpenAI(
    azure_deployment=CHAT_DEPLOYMENT,
    api_key=AZURE_OPENAI_API_KEY,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    openai_api_version=CHAT_API_VERSION,
    openai_api_type="azure",
    temperature=0.7
)

# PINECONE_API_KEY is read from the environment in the setup cell at the top of the notebook.
pc = Pinecone(
    api_key=PINECONE_API_KEY
)

# Handle to the Pinecone index that is queried for recipes.
index = pc.Index("crave-agent")

## Define prompt template & chat

# User Input

The following class defines what the LLM should output given a user's input, in which we describe each property by defining a type and giving a description.

In [None]:
from pydantic import BaseModel, Field
from typing import List
import json
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field
from typing import List


# Schema of the structured profile / meal request that the LLM must extract from the user's text.
# It is handed to PydanticOutputParser, whose format instructions are embedded in the
# extraction prompt built by generate_struct.
# NOTE(review): documented with `#` comments rather than a class docstring on purpose — pydantic
# may copy a model docstring into the generated JSON schema, which would change the prompt text
# (TODO confirm for the installed pydantic version).
class UserInput(BaseModel):
    # --- Physical profile (inputs to the BMI / TDEE helpers) ---
    age: int = Field(description="User's age in years")
    weight: float = Field(description="User's weight in kg")
    height: float = Field(description="User's height in cm")
    gender: str = Field(description="User's gender (male/female)")
    activity_level: int = Field(description="User's activity level (1: sedentary, 2: light, 3: moderate, 4: active, 5: very active)")
    # --- Meal request ---
    meal_type: str = Field(description="Type of meal user wants to eat (breakfast, lunch, dinner, snack)")
    food_category: str = Field(description="Category of the food (e.g., Mexican, Italian, etc.)")
    cravings: str = Field(description="The type of craving (e.g., sweet, salty, etc.)")
    included_ingredients: List[str] = Field(description="Ingredients available to cook")
    # generate_struct overwrites this key with the value computed by the calorie helpers.
    calorie_intake: int = Field(description="Calculated calorie intake for this meal based on user's data")
    #response_format: str = Field(description="The format of the response")

# Initialize the output parser with the updated schema
parser = PydanticOutputParser(pydantic_object=UserInput)

## Feature Engineering - BMI & Calorie Helper function

Using the user's features such as weight, age, gender and meal, we generate features like BMI, TDEE and then decide on a specific calorie intake that will help us later when fetching meals

In [None]:
def calculate_bmi(weight_kg, height_cm):
    """Return the body-mass index rounded to two decimals.

    BMI is the weight in kilograms divided by the squared height in metres;
    the height is supplied in centimetres and converted here.
    """
    metres = height_cm / 100
    squared_height = metres ** 2
    return round(weight_kg / squared_height, 2)

def classify_bmi(bmi):
    """Map a BMI value to "Underweight", "Normal weight", "Overweight" or "Obese".

    Each category is bounded above by an exclusive threshold (18.5, 25, 30);
    anything not below 30 is reported as "Obese".
    """
    upper_bounds = (
        (18.5, "Underweight"),
        (25, "Normal weight"),
        (30, "Overweight"),
    )
    for limit, label in upper_bounds:
        if bmi < limit:
            return label
    return "Obese"

def calculate_tdee(weight_kg, height_cm, age, gender, activity_level):
    """Estimate Total Daily Energy Expenditure (kcal) with the Mifflin-St Jeor equation.

    The basal metabolic rate uses the +5 offset when ``gender`` is "male"
    (case-insensitive) and the -161 offset for every other value. The result is
    scaled by the multiplier for ``activity_level`` (1-5); unknown levels fall back
    to the sedentary multiplier. The product is truncated to an int.
    """
    sex_offset = 5 if gender.lower() == "male" else -161
    bmr = (10 * weight_kg) + (6.25 * height_cm) - (5 * age) + sex_offset

    # Level -> multiplier: sedentary, light, moderate, active, very active.
    multiplier_by_level = dict(zip(range(1, 6), (1.2, 1.375, 1.55, 1.725, 1.9)))
    factor = multiplier_by_level.get(activity_level, 1.2)

    return int(bmr * factor)

def adjust_calories_based_on_bmi(calories, bmi_class):
    """Scale a calorie figure by the modifier for the given BMI class.

    Underweight raises it by 15%, Overweight lowers it by 10%, Obese lowers it by 20%;
    "Normal weight" and any unrecognised class leave it unchanged. The result is
    truncated to an int.
    """
    scale_by_class = {
        "Underweight": 1.15,
        "Normal weight": 1.0,
        "Overweight": 0.9,
        "Obese": 0.8,
    }
    scale = scale_by_class.get(bmi_class, 1.0)
    return int(calories * scale)

def distribute_calories_per_meal(total_calories, meal_type):
    """Return the share of the daily calories allotted to one meal, truncated to an int.

    The meal type is matched case-insensitively: breakfast 30%, lunch 40%,
    dinner 25%, snack 15%. Any other meal type receives 30%.
    """
    share_by_meal = {
        "breakfast": 0.3,
        "lunch": 0.4,
        "dinner": 0.25,
        "snack": 0.15,
    }
    meal_key = meal_type.lower()
    share = share_by_meal.get(meal_key, 0.3)
    return int(total_calories * share)

### Example
Given an already parsed user input, the following cell shows how we extract and create additional features

In [None]:
# Example Parsed User Input
# Hand-written stand-in for an already parsed user input.
# "calorie_limit" is not read by any of the steps below.
user_profile = {
    "age": 25,
    "weight": 77,
    "height": 160,
    "calorie_limit": 600,
    "gender": "female",
    "activity_level": 3,
    "meal_type": "breakfast"
}

# Step 1: Calculate BMI and map it to a weight class
bmi = calculate_bmi(user_profile["weight"], user_profile["height"])
bmi_class = classify_bmi(bmi)

# Step 2: Calculate Daily Calories (TDEE)
tdee = calculate_tdee(
    user_profile["weight"],
    user_profile["height"],
    user_profile["age"],
    user_profile["gender"],
    user_profile["activity_level"]
)

# Step 3: Modify Calories Based on BMI Class
adjusted_calories = adjust_calories_based_on_bmi(tdee, bmi_class)

# Step 4: Get Meal-Specific Calories (share of the adjusted daily total for this meal type)
meal_calories = distribute_calories_per_meal(adjusted_calories, user_profile["meal_type"])

# Display Results, one line per pipeline stage
print(f" User BMI: {bmi} ({bmi_class})")
print(f" Daily Calorie Needs (TDEE): {tdee} kcal")
print(f" Adjusted Calories (After BMI Modifier): {adjusted_calories} kcal")
print(f" Recommended Calories for {user_profile['meal_type'].capitalize()}: {meal_calories} kcal")

 User BMI: 30.08 (Obese)
 Daily Calorie Needs (TDEE): 2300 kcal
 Adjusted Calories (After BMI Modifier): 1840 kcal
 Recommended Calories for Breakfast: 552 kcal


## Raw example from user input to features

This function takes the user's raw text, asks the LLM to convert it to JSON following the schema above, and then adds the engineered calorie feature described in the previous section.

In [None]:
def generate_struct(user_text, _debug=False):
    """Turn free-form user text into a structured profile dict with a computed calorie target.

    The chat model is asked to emit JSON matching the ``UserInput`` schema. The reply is
    unwrapped from an optional Markdown code fence, parsed, and its ``calorie_intake`` key is
    overwritten with the value derived from the BMI / TDEE / meal-share helpers.

    Parameters:
    user_text (str): The user's request in natural language.
    _debug (bool): Accepted for interface compatibility; not used by this function.

    Returns:
    dict: The parsed profile with ``calorie_intake`` set to the computed per-meal calories.

    Raises:
    Exception: "LLM output is not valid JSON." when the reply cannot be decoded, or
               "Parsing error: ..." when a required field is missing or a calculation fails.
               The original error is attached as ``__cause__``.
    """
    import re  # local import: this cell runs before the notebook cell that imports `re`

    prompt_template = """
    Extract and structure the following user text into JSON format according to the provided schema.

    Follow this **exact JSON schema**:
    {format_instructions}

    Only output JSON, without explanations or extra text.

    User Text: {user_text}
    """

    instructions = parser.get_format_instructions()

    prompt = PromptTemplate(
        input_variables=["user_text"],
        partial_variables={"format_instructions": instructions},
        template=prompt_template
    )

    formatted_prompt = prompt.format(user_text=user_text)

    # Call the LLM and take the raw text of its reply
    llm_output = chat.invoke(formatted_prompt)
    reply_text = llm_output.content.strip()

    # Unwrap a Markdown fence (```json ... ``` or ``` ... ```).
    # str.strip("```json") removes a *set of characters*, not a prefix, and did nothing when the
    # reply started with whitespace, so the fence is matched explicitly instead.
    fenced = re.search(r"```(?:json)?\s*(.*?)\s*```", reply_text, flags=re.DOTALL | re.IGNORECASE)
    if fenced:
        json_string = fenced.group(1)
    else:
        # No complete fence: drop a dangling opening or closing marker if one is present.
        json_string = re.sub(r"^```(?:json)?|```$", "", reply_text, flags=re.IGNORECASE).strip()

    try:
        json_output = json.loads(json_string)
    except json.JSONDecodeError as exc:
        raise Exception("LLM output is not valid JSON.") from exc

    try:
        # Read every field the pipeline relies on; the first missing key raises KeyError,
        # which is reported below as a parsing error.
        required_keys = (
            "weight", "height", "age", "gender", "activity_level",
            "meal_type", "food_category", "cravings", "included_ingredients",
        )
        profile = {key: json_output[key] for key in required_keys}

        # Calculate calories: BMI class -> daily need -> BMI adjustment -> share for this meal
        bmi = calculate_bmi(profile["weight"], profile["height"])
        bmi_class = classify_bmi(bmi)
        tdee = calculate_tdee(
            profile["weight"], profile["height"], profile["age"],
            profile["gender"], profile["activity_level"],
        )
        adjusted_calories = adjust_calories_based_on_bmi(tdee, bmi_class)
        meal_calories = distribute_calories_per_meal(adjusted_calories, profile["meal_type"])

        # The computed value replaces whatever the LLM put in this field
        json_output["calorie_intake"] = meal_calories
        #json_output["response_format"] = "response_format"  # Placeholder for response_format without quotes

        return json_output
    except Exception as e:
        raise Exception(f"Parsing error: {str(e)}") from e

In [None]:
# Free-text request used to exercise generate_struct end to end.
user_text = """
I'm Nitzan 28 years old, weight 65kg, and I'm 170cm tall.
I'm male and I work out 6 times a week.
I want a light snack with chocolate.
I have strawberries and yogurt and bannanas in my fridge.
"""
parsed_output = generate_struct(user_text, _debug=True)
# Bare last expression so the notebook displays the resulting dict.
parsed_output

{'age': 28,
 'weight': 65,
 'height': 170,
 'gender': 'male',
 'activity_level': 4,
 'meal_type': 'snack',
 'food_category': 'chocolate',
 'cravings': 'sweet',
 'included_ingredients': ['strawberries', 'yogurt', 'bannanas'],
 'calorie_intake': 408}

## LLM Chain

### Imports

# VectorDB

## User Prompt and VectorDB

### Defining the Query Chain Function

The following cell defines the function `build_query_chain`, which builds a chain that generates a detailed query string that will be used to query the vectorDB, and includes the following:

1. **Response Schema Definition:**  
   - A `ResponseSchema` named `query_string` is defined to describe the expected output—a detailed query string for a similarity search in a vector database of healthy recipes.

2. **Prompt Template Setup:**  
   - A `PromptTemplate` is created with a detailed multi-line string.  
   - The template instructs the language model to act as an expert chef, recipe creator, and nutritionist, taking into account various user preferences (e.g., food category, required ingredients, user profile details like age, weight, height, etc.).
   - It clearly specifies the response format using the output parser instructions.

In [None]:
from openai import OpenAI
from langchain.chat_models import AzureChatOpenAI
from dotenv import load_dotenv
# from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_openai import AzureOpenAIEmbeddings
import os
from langchain.prompts import PromptTemplate
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from langchain.chains import LLMChain, SequentialChain
from langchain.chat_models import AzureChatOpenAI

def build_query_chain(llm):
    """Build the chain that turns a structured user profile into a vector-search query string.

    Parameters:
    llm: The chat model the chain should call.

    Returns:
    tuple: ``(chain, response_format, output_parser)`` where
        - ``chain`` is a SequentialChain whose single output key is ``'query'``,
        - ``response_format`` is the format-instruction text that must be supplied to the
          chain as the ``response_format`` input,
        - ``output_parser`` is the StructuredOutputParser describing the ``query_string`` field.
    """
    query_string = ResponseSchema(
        name="query_string",
        description="A detailed and specific query string that reflects the user's needs, suitable for a similarity search on a vector database of healthy recipes"
    )

    output_parser = StructuredOutputParser.from_response_schemas(
        [query_string]
        )

    response_format = output_parser.get_format_instructions()

    # The template now interpolates {calorie_intake}: the prompt asks the model to respect the
    # user's calorie range and the chain already receives that input, but the value was never
    # shown to the model.
    prompt = PromptTemplate.from_template(
        template="""
        You are an expert chef, a professional recipe creator, and a certified nutritionist.
        Your task is to suggest the best recipe that fits the following user preferences,
        ensuring that the recipe is not only delicious but also healthy for the user.

        Carefully analyze and consider the following details:

        'Food Category': {food_category}  # Type of cuisine or food group (e.g., vegan, Mediterranean, low-carb)
        'Required Ingredients': {included_ingredients}  # The ingredients that must be included in the recipe (e.g., spinach, quinoa)

        **User Profile:**
        'Age': {age}  # User's age
        'Weight': {weight}  # User's weight (in kg)
        'Height': {height}  # User's height (in cm)
        'Gender': {gender}  # User's gender (male/female)
        'Activity Level': {activity_level}  # How active the user is (1: sedentary, 2: light, 3: moderate, 4: active, 5: very active)
        'Meal Type': {meal_type}  # The type of meal (breakfast, lunch, dinner, snack)
        'Cravings': {cravings}  # Any specific craving (e.g., sweet, savory, crunchy)
        'Target Calories': {calorie_intake}  # Recommended calories for this meal (in kcal)

        As a professional nutritionist, ensure the recipe aligns with the user's health goals.
        If the user seeks to maintain or lose weight, gain muscle, or manage a specific condition,
        provide a recipe that helps meet those goals while maintaining proper nutrition and balanced calories.

        Your job is to generate a **clear and specific** query string based on these preferences to perform a similarity search
        in a vector database of recipes.

        Be sure to:
        - Focus on the key ingredients, food type, and any health requirements.
        - Suggest meals that are nutritionally balanced, considering the user's weight, age, and activity level.
        - Ensure the recipe falls within the user's required **calorie range**.
        - Use clear language that directly addresses the food needs and health considerations of the user.

        **Response format**: {response_format}
        """
    )

    query_chain = LLMChain(llm=llm, prompt=prompt, output_key='query')

    chain = SequentialChain(
        chains=[query_chain],
        input_variables=[
            'food_category', 'included_ingredients', 'height',
            'age', 'weight', 'gender', 'activity_level', 'meal_type', 'cravings', 'calorie_intake', 'response_format'
            ],
        output_variables=['query'],
        verbose=False
    )

    return chain, response_format, output_parser

### Generate Query & Similarity Search

- **Parse User Input:**  
  Uses `generate_struct` to turn raw text into structured data.

- **Build & Run Query Chain:**  
  Creates a query chain with `build_query_chain(chat)`, adds the response format, and runs it to generate a query string (and extract `calorie_intake`).

- **Feature Engineering:**  
  Uses the helper functions above to calculate the calorie intake for the user from their features. It then sets an upper bound (10% above the target) and a lower bound (40% below, i.e. 60% of the target). Searching a range instead of one specific number makes the search wider.

- **Embed & Search:**  
  Converts the query to an embedding with `embedder_gpt_model.embed_query(query)` and queries the Pinecone index with a calorie filter to fetch the top 20 similar recipes.

In [None]:
import json

# Function to generate query and perform similarity search
def generate_query(user_text, _debug=True, chat=None, embedder_gpt_model=None):
    """Generate a search query from the user's text and run a calorie-filtered similarity search.

    Parameters:
    user_text (str): The user's request in natural language.
    _debug (bool): Forwarded to ``generate_struct``.
    chat: Chat model used to write the query string.
    embedder_gpt_model: Embedding model exposing ``embed_query``.

    Returns:
    tuple: ``(response, calorie_intake)`` — the Pinecone query response (top 20 matches with
           values and metadata) and the calorie target computed for this meal.

    Note: the search runs against the module-level ``index`` in namespace ``"embds+calo"``.
    """
    # Imported locally so this cell does not depend on another cell's import list.
    from langchain_core.exceptions import OutputParserException

    # Structured profile; the per-meal calorie target is calculated inside generate_struct
    parsed_output = generate_struct(user_text, _debug=_debug)

    # Build the chain and hand it the format instructions it expects as an input
    chain, response_format, output_parser = build_query_chain(chat)
    parsed_output['response_format'] = response_format

    # Run the chain, then read the query through the parser that defines the response format.
    # (A fixed character slice of the raw reply breaks as soon as the model changes whitespace.)
    query_output = chain.run(**parsed_output)
    try:
        query = str(output_parser.parse(query_output)["query_string"])
    except OutputParserException:
        # Best effort: a reply that ignores the format is still usable as free-text query.
        query = query_output.strip()

    calorie_intake = parsed_output['calorie_intake']

    # Search a calorie window around the target rather than one exact value
    calorie_range_max = int(calorie_intake * 1.1)  # 10% above the target
    calorie_range_min = int(calorie_intake * 0.6)  # 60% of the target, i.e. 40% below it

    # Convert the query into its vector representation
    query_embedding = embedder_gpt_model.embed_query(query)

    # Similarity search in Pinecone, restricted to recipes inside the calorie window
    response = index.query(
        namespace="embds+calo",
        vector=query_embedding,
        top_k=20,
        include_values=True,
        include_metadata=True,
        filter={
            "calories": {
                "$lte": calorie_range_max,  # Less than or equal to max calories
                "$gte": calorie_range_min  # Greater than or equal to min calories
            }
        }
    )

    return response, calorie_intake

# User Friendly Display
The following code is used to create user friendly display of the results and can be ignored.

In [None]:
import pandas as pd
from IPython.display import display, HTML
import matplotlib.pyplot as plt
from PIL import Image as PILImage
import requests
from io import BytesIO
import math
import uuid
import re
import ast

def parse_recipe_dict(recipe_dict):
    """
    Reshape a raw recipe record into the flat dictionary the display helpers consume.

    Parameters:
    recipe_dict (dict): Raw recipe record with dataset-style keys ('Name', 'Calories', 'TotalTime', ...)

    Returns:
    dict: Display-ready recipe (name, author, category, rating, review count, time,
          calories, servings, macro-nutrient shares, ingredients, instructions, keywords, images)
    """
    field = recipe_dict.get

    # Identity and popularity
    name = field('Name', 'Unnamed Recipe')
    author = field('AuthorName', 'Unknown')
    category = field('RecipeCategory', 'Uncategorized')
    rating = field('AggregatedRating', 0)
    review_count = int(field('ReviewCount', 0))

    # Duration: the total time wins; prep time is the only fallback (cook time is not consulted)
    duration = "00:00:00"
    for candidate in (field('TotalTime', ''), field('PrepTime', '')):
        if candidate and isinstance(candidate, str):
            duration = parse_iso_duration(candidate)
            break

    calories = field('Calories', 0)
    servings = field('RecipeServings', 1)

    # Share of each macro-nutrient in the combined carbohydrate + protein + fat amount
    carbs = field('CarbohydrateContent', 0)
    protein = field('ProteinContent', 0)
    fat = field('FatContent', 0)
    macro_total = carbs + protein + fat
    if macro_total > 0:
        carbs_share = (carbs / macro_total) * 100
        protein_share = (protein / macro_total) * 100
        fat_share = (fat / macro_total) * 100
    else:
        carbs_share = protein_share = fat_share = 0

    # Ingredient name -> quantity, paired by position up to the shorter of the two lists
    quantities = safe_eval_list(field('RecipeIngredientQuantities', '[]'))
    parts = safe_eval_list(field('RecipeIngredientParts', '[]'))
    pair_count = min(len(quantities), len(parts))
    ingredients = {parts[pos]: quantities[pos] for pos in range(pair_count)}

    # Instruction steps become one newline-separated text
    instructions = "\n".join(safe_eval_list(field('RecipeInstructions', '[]')))

    keywords = safe_eval_list(field('Keywords', '[]'))

    # 'character(0)' is the marker for "no images"
    raw_images = field('Images', 'character(0)')
    images = [] if raw_images == 'character(0)' else safe_eval_list(raw_images)

    return {
        'Name': name,
        'Author': author,
        'category': category,
        'rating': rating,
        'reviews_count': review_count,
        'time': duration,
        'calories': calories,
        'servings': servings,
        'carbohydrates percentage': carbs_share,
        'proteins percentage': protein_share,
        'fat percentage': fat_share,
        'ingredients': ingredients,
        'instructions': instructions,
        'keywords': keywords,
        'images': images,
    }

def safe_eval_list(list_str):
    """
    Safely turn a string representation of a list into a Python list.

    Parameters:
    list_str (str | list): R-style vector text (e.g., "c('item1', 'item2')"), a Python
                           literal (e.g., "['a', 'b']"), or an actual list

    Returns:
    list: Always a list. A list argument is returned unchanged; an R-style vector yields its
          non-empty quoted items; a literal tuple or set is converted to a list; a single
          non-empty quoted string becomes a one-element list. Anything else, including
          input that cannot be parsed, yields an empty list.
    """
    if isinstance(list_str, list):
        return list_str

    # R-style c(...) vector: collect the quoted items; unquoted tokens such as NA are skipped
    if isinstance(list_str, str) and list_str.startswith('c('):
        content = list_str[2:-1]
        quoted = re.findall(r'"([^"]*)"|\'([^\']*)\'', content)
        # Each match is a (double-quoted, single-quoted) pair with one side empty
        return [item for pair in quoted for item in pair if item]

    try:
        parsed = ast.literal_eval(list_str)
    except (SyntaxError, ValueError, TypeError):
        return []

    # literal_eval can return any literal; callers iterate the result, so normalise it.
    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, (tuple, set)):
        return list(parsed)
    if isinstance(parsed, str):
        # A lone quoted value (e.g. a single image URL) must not be iterated char by char
        return [parsed] if parsed else []
    return []

def parse_iso_duration(iso_duration):
    """
    Convert an ISO 8601 time duration such as "PT1H30M" into "HH:MM:SS" text.

    Parameters:
    iso_duration (str): Duration expected to start with "PT"

    Returns:
    str: Zero-padded "HH:MM:SS"; "00:00:00" when the input is not a string starting with "PT"
    """
    amounts = {'H': 0, 'M': 0, 'S': 0}

    if isinstance(iso_duration, str) and iso_duration.startswith('PT'):
        remainder = iso_duration

        # Hours, then minutes: read the number before the unit letter and continue with what follows it
        for unit in ('H', 'M'):
            if unit in remainder:
                pieces = remainder.split(unit)
                amounts[unit] = int(pieces[0].replace('PT', ''))
                remainder = pieces[1]

        # Seconds are whatever number precedes the 'S' in the remaining text
        if 'S' in remainder:
            amounts['S'] = int(remainder.split('S')[0].replace('PT', ''))

    return "{:02d}:{:02d}:{:02d}".format(amounts['H'], amounts['M'], amounts['S'])

def display_recipes_ranked(recipes_list, relevance_scores=None):
    """
    Render several recipes one after another, best match first.

    Parameters:
    recipes_list (list): Raw recipe dictionaries
    relevance_scores (list, optional): Score for each recipe, in the same order as recipes_list.
                                      When given (and non-empty), recipes are shown by descending
                                      score; otherwise the incoming order is kept.
    """
    parsed = [parse_recipe_dict(raw) for raw in recipes_list]

    if relevance_scores:
        # Stable sort on the score only, highest first
        ranked = sorted(zip(parsed, relevance_scores), key=lambda pair: pair[1], reverse=True)
        ordered = [recipe for recipe, _score in ranked]
    else:
        ordered = parsed

    count = len(ordered)
    for position, recipe in enumerate(ordered, start=1):
        display_single_recipe(recipe, position, count)

def display_single_recipe(recipe_dict, rank=None, total=None):
    """
    Display a single recipe in a compact format.

    Every piece of recipe text (name, author, category, image URLs, ingredients, instructions,
    keywords, ...) is HTML-escaped before it is placed in the markup, so characters such as
    '&', '<' or quotes in the data cannot break or inject into the rendered output.

    Parameters:
    recipe_dict (dict): Recipe dictionary; a raw record (one containing 'RecipeId') is first
                        converted with parse_recipe_dict
    rank (int, optional): Rank of this recipe
    total (int, optional): Total number of recipes
    """
    from html import escape  # stdlib helper, imported locally because the cell's imports lack it

    def _safe(value):
        """Return ``value`` as text that is safe in HTML content and in quoted attributes."""
        return escape(str(value), quote=True)

    # Check if the recipe dictionary needs transformation
    if 'RecipeId' in recipe_dict:
        recipe_dict = parse_recipe_dict(recipe_dict)

    # Whole stars for the rating; missing, NaN or non-numeric ratings show no stars
    try:
        star_count = int(float(recipe_dict['rating']))
    except (TypeError, ValueError, OverflowError):
        star_count = 0

    # Create HTML for recipe header with rank if provided
    rank_display = f"<span style='color: #888; font-size: 0.9em;'>Rank {rank}/{total}</span>" if rank else ""

    header_html = f"""
    <div style="font-family: Arial; padding: 10px; border-radius: 5px; border: 1px solid #ddd; margin-bottom: 10px;">
        <div style="display: flex; justify-content: space-between; align-items: center;">
            <h2 style="color: #333; margin: 0 0 5px 0;">{_safe(recipe_dict['Name'])}</h2>
            {rank_display}
        </div>
        <div style="display: flex; justify-content: space-between; align-items: center;">
            <span style="color: #666; font-size: 0.9em;">By {_safe(recipe_dict['Author'])} | {_safe(recipe_dict['category'])}</span>
            <span style="font-size: 1em; color: #f6ab00;">{'★' * star_count} ({_safe(recipe_dict['reviews_count'])})</span>
        </div>
    </div>
    """
    display(HTML(header_html))

    # Display images in a horizontal scrollable container
    images = recipe_dict['images']
    if images:
        image_container = f"""
        <div style="margin: 10px 0; max-width: 100%; overflow-x: auto;">
            <div style="display: flex; gap: 5px;">
        """

        for img_url in images:
            image_container += f"""
            <div style="flex: 0 0 auto; width: 150px; height: 100px;">
                <img src="{_safe(img_url)}" style="width: 100%; height: 100%; object-fit: cover; border-radius: 3px;">
            </div>
            """

        image_container += """
            </div>
        </div>
        """

        display(HTML(image_container))

    # Create a compact info section
    info_html = "<div style='display: flex; flex-wrap: wrap; gap: 10px; margin: 10px 0;'>"
    info_html += f"<div style='padding: 5px 10px; background-color: #f0f8ff; border-radius: 4px;'><b>Time:</b> {_safe(format_time(recipe_dict['time']))}</div>"
    info_html += f"<div style='padding: 5px 10px; background-color: #f0f8ff; border-radius: 4px;'><b>Calories:</b> {_safe(recipe_dict['calories'])} kcal</div>"
    info_html += f"<div style='padding: 5px 10px; background-color: #f0f8ff; border-radius: 4px;'><b>Servings:</b> {_safe(recipe_dict['servings'])}</div>"

    # Add nutrition percentages (rounded)
    info_html += f"<div style='padding: 5px 10px; background-color: #f0fff0; border-radius: 4px;'><b>Carbs:</b> {round(recipe_dict.get('carbohydrates percentage', 0), 1)}%</div>"
    info_html += f"<div style='padding: 5px 10px; background-color: #f0fff0; border-radius: 4px;'><b>Protein:</b> {round(recipe_dict.get('proteins percentage', 0), 1)}%</div>"
    info_html += f"<div style='padding: 5px 10px; background-color: #f0fff0; border-radius: 4px;'><b>Fat:</b> {round(recipe_dict.get('fat percentage', 0), 1)}%</div>"
    info_html += "</div>"
    display(HTML(info_html))

    # Display ingredients and instructions stacked vertically
    content_html = """
    <div style='font-family: Arial; margin: 10px 0;'>
        <h4 style='margin-top: 0; margin-bottom: 5px;'>Ingredients</h4>
        <ul style='margin: 0 0 10px 0; padding-left: 20px;'>
    """

    for ingredient, amount in recipe_dict.get('ingredients', {}).items():
        content_html += f"<li><b>{_safe(ingredient)}:</b> {_safe(amount)}</li>"

    # .format applies only to the closing fragment below, not to the ingredient markup above
    content_html += """
        </ul>
        <h4 style='margin-top: 10px; margin-bottom: 5px;'>Instructions</h4>
        <p style='margin: 0;'>{}</p>
    </div>
    """.format(_safe(recipe_dict.get('instructions', 'No instructions available.')))

    display(HTML(content_html))

    # Display tags if any
    if recipe_dict.get('keywords'):
        tags_html = "<div style='margin: 5px 0;'>"
        for keyword in recipe_dict['keywords']:
            tags_html += f"<span style='background-color: #eaeaea; padding: 2px 8px; border-radius: 10px; font-size: 0.8em; margin-right: 5px;'>{_safe(keyword)}</span>"
        tags_html += "</div>"
        display(HTML(tags_html))

    # Add separator between recipes
    display(HTML("<hr style='margin: 20px 0; border: 0; border-top: 1px dashed #ccc;'>"))

def format_time(time_str):
    """Format an "HH:MM:SS" string compactly, e.g. "00:05:00" -> "5m", "01:30:00" -> "1h30m".

    Seconds are ignored; a duration with neither hours nor minutes is shown as "< 1m".
    Input that cannot be read as "<int>:<int>..." (wrong type, too few parts, non-numeric
    parts) is returned unchanged.
    """
    try:
        parts = time_str.split(':')
        hours, minutes = int(parts[0]), int(parts[1])
    except (AttributeError, IndexError, TypeError, ValueError):
        # Only parsing problems are tolerated; KeyboardInterrupt and the like still propagate
        return time_str

    label = ""
    if hours > 0:
        label += f"{hours}h"
    if minutes > 0:
        label += f"{minutes}m"
    return label or "< 1m"

# Chaining everything together

In [None]:
# Sample request used to run the full pipeline: text -> structured profile -> query -> Pinecone search.
user_text="""
I'm Emily 33 years old, weight 50 kg, and I'm 170cm tall.
I'm female and I work out 6 times a week.
I want a heavy dinner with bagel with bacon, egg and cheese.
I have ham and bagel and tomato in my fridge.
"""
# `response` holds the Pinecone matches; `calorie_intake` is the computed target for this meal.
response, calorie_intake = generate_query(user_text, _debug=True, chat=chat, embedder_gpt_model=embedder_gpt_model)
print(f'Recommended Calorie intake - {calorie_intake}')


We embedded the reviews of each recipe in an earlier notebook, and we load those embeddings here.

In [None]:
#  Load the saved RecipeIds
# Load the saved review embeddings. The lookup cell below reads each record's
# 'RecipeId' and 'Embedding' keys.
with open("embeddings_and_ids.json", "r") as f:
    embeddings_and_ids = json.load(f)

# Report how many records were loaded.
print(f" Loaded {len(embeddings_and_ids)} embedding_data")

 Loaded 11844 embedding_data


# Recipe Selection Process

After querying the user's input in our vector database of recipes, we retrieved the **20 most similar recipes**. Next, we refine this selection by ranking the recipes based on two factors:

1. **Similarity to the user's preferences** (based on vector embeddings).  
2. **The quality of reviews** (when available).  

## Handling Recipes with and without Reviews

- **For recipes with reviews:**  
  - Compute the **cosine similarity** between the recipe and its reviews.  
  - Take the **average** of the user-recipe similarity and the recipe-review similarity to get a final ranking score.  

- **For recipes without reviews:**  
  - These recipes should still be considered because they might be **highly relevant** to the user’s request.  
  - To account for missing review data, we compute the **average of the user-recipe similarity** with the **mean of all recipe-review similarity scores** from the top 20 recipes.  

## Final Ranking and Selection

With these final scores, we **rank all 20 recipes** and select the **top 3 recipes** to present to the user. This ensures a **balanced evaluation**, incorporating both the **similarity to user preferences** and **review-based quality** when available.

In [None]:
# Unpack the Pinecone matches into parallel lists (same order as the response)
top_20_recipes = response['matches']
top_20_recipe_embeddings = [match['values'] for match in top_20_recipes]  # Recipe embeddings
top_20_recipe_ids = [match['id'] for match in top_20_recipes]  # Recipe IDs
top_scores = [(match['score'], match['id']) for match in top_20_recipes]  # Include both the score and RecipeId

# Index the review records by RecipeId once instead of re-scanning the whole list for every
# recipe. setdefault keeps the first record per id, matching the first-hit behaviour of a scan.
reviews_by_recipe_id = {}
for review in embeddings_and_ids:
    reviews_by_recipe_id.setdefault(review['RecipeId'], review)

# Get review embeddings for the top 20 recipes (match ids are converted with int() for the
# lookup; the original id is kept as the key of the result)
matching_embeddings = {}
for search_recipe_id in top_20_recipe_ids:
    matched_review = reviews_by_recipe_id.get(int(search_recipe_id))
    if matched_review is not None:
        matching_embeddings[search_recipe_id] = matched_review['Embedding']

# Check if embeddings were found and print the results
if matching_embeddings:
    print(f"Found embeddings for {len(matching_embeddings)} RecipeIds: {list(matching_embeddings.keys())}")
else:
    print("No embeddings found for the top 20 RecipeIds.")

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def _cosine_sim(vec_a, vec_b):
    """Return the cosine similarity of two 1-D vectors (0.0 if either has zero norm)."""
    a = np.asarray(vec_a, dtype=float)
    b = np.asarray(vec_b, dtype=float)
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    return float(np.dot(a, b) / denom) if denom else 0.0


def compute_final_similarities(top_20_recipes, top_20_recipe_embeddings, matching_embeddings, top_scores, top_k=3):
    """Re-rank retrieved recipes by blending query similarity with review similarity.

    For each recipe the final score is the average of its original retrieval
    score and a review term:
      * recipe has a review embedding and a usable recipe embedding ->
        cosine similarity between the two;
      * otherwise -> the mean of the review similarities that could be
        computed for the other recipes (0 if none could be computed).

    Args:
        top_20_recipes: sequence of match objects; only ``match['id']`` is read.
        top_20_recipe_embeddings: recipe embeddings, positionally aligned with
            ``top_20_recipes``. An entry that is ``None`` or empty is treated
            as missing.
        matching_embeddings: dict mapping a recipe id (same form as
            ``match['id']``) to that recipe's review embedding.
        top_scores: sequence of ``(score, recipe_id)`` tuples, positionally
            aligned with ``top_20_recipes``; only the score is read.
        top_k: how many recipe ids to return (default 3).

    Returns:
        List of at most ``top_k`` recipe ids, highest combined score first.
    """
    # Pass 1: recipe-review similarity per position (None when it cannot be
    # computed). Indexing by position keeps the function independent of any
    # notebook-level id list.
    review_sims = []
    for position, match in enumerate(top_20_recipes):
        recipe_id = match['id']
        recipe_embedding = top_20_recipe_embeddings[position]
        has_recipe_embedding = recipe_embedding is not None and len(recipe_embedding) > 0
        if recipe_id in matching_embeddings and has_recipe_embedding:
            review_sims.append(_cosine_sim(recipe_embedding, matching_embeddings[recipe_id]))
        else:
            review_sims.append(None)

    # Fallback review term for recipes whose own review similarity is unavailable.
    known_sims = [sim for sim in review_sims if sim is not None]
    avg_sim = float(np.mean(known_sims)) if known_sims else 0

    # Pass 2: average the retrieval score with the review term. Every recipe
    # always receives a freshly computed score, so no value can leak over
    # from a previous iteration.
    similarity_scores = []
    for position, match in enumerate(top_20_recipes):
        review_term = review_sims[position]
        if review_term is None:
            review_term = avg_sim
        combined_similarity = (top_scores[position][0] + review_term) / 2
        similarity_scores.append((match['id'], combined_similarity))

    # Highest combined score first; the sort is stable, so ties keep retrieval order.
    similarity_scores.sort(key=lambda pair: pair[1], reverse=True)

    return [recipe_id for recipe_id, _ in similarity_scores[:max(top_k, 0)]]


# Re-rank the retrieved recipes and keep the ids of the three best ones.
top_3_recipe_ids = compute_final_similarities(
    top_20_recipes,
    top_20_recipe_embeddings,
    matching_embeddings,
    top_scores,
)
# Convert the ids to int for the DataFrame lookup that follows.
vector_ids = list(map(int, top_3_recipe_ids))

print(f"Top 3 Recipe IDs: {vector_ids}")

Top 3 Recipe IDs: [198967, 47038, 397232]


Now that we have the best 3 recipes, we show them to the user!

In [None]:
import pandas as pd
import numpy as np
import kagglehub
import re
import os
# Download the latest version of the dataset
path = kagglehub.dataset_download("irkaal/foodcom-recipes-and-reviews")
csv_files = [f for f in os.listdir(path) if f.endswith('.csv')]


def _pick_csv(keyword):
    """Return the alphabetically first CSV whose name contains `keyword` (case-insensitive)."""
    candidates = sorted(f for f in csv_files if keyword in f.lower())
    if not candidates:
        raise FileNotFoundError(
            f"No CSV containing '{keyword}' found in {path}; CSV files present: {csv_files}"
        )
    return candidates[0]


# os.listdir returns entries in arbitrary order, so pick each file by name
# instead of by its position in the listing.
# NOTE(review): assumes the CSV names contain "recipe" / "review" - confirm
# against the downloaded dataset.
recipes_file_path = os.path.join(path, _pick_csv("recipe"))
reviews_file_path = os.path.join(path, _pick_csv("review"))
df = pd.read_csv(recipes_file_path)

In [None]:
# Filtering with a boolean mask returns rows in DataFrame order, not ranking
# order, so re-sort the matched rows by each id's position in vector_ids
# (best recipe first) before displaying them.
rank_by_id = {recipe_id: rank for rank, recipe_id in enumerate(vector_ids)}
records = sorted(
    df[df["RecipeId"].isin(vector_ids)].to_dict(orient="records"),
    key=lambda record: rank_by_id[record["RecipeId"]],
)
display_recipes_ranked(records)