In [1]:
import chromadb
from chromadb.config import Settings
import os
import pandas as pd
import numpy as np
import json
import time
from langchain_groq import ChatGroq
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from IPython.display import display, Markdown
from typing import Annotated
from typing_extensions import TypedDict
from langchain_community.utilities import WikipediaAPIWrapper
from langchain_community.tools import WikipediaQueryRun
from IPython.display import Markdown, JSON
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.output_parsers import StrOutputParser
from langchain.schema import Document
from langgraph.graph import END, StateGraph
from langchain_chroma import Chroma
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load variables from a local .env file into the process environment so the
# API key lookup below can succeed.
load_dotenv()

# Embedding client for the hosted BAAI/bge-small-en-v1.5 model.
# A missing HUGGINGFACE_API_KEY raises KeyError here rather than failing later.
embeddings = HuggingFaceInferenceAPIEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    api_key=os.environ["HUGGINGFACE_API_KEY"],
)

In [None]:
import kagglehub

# Download the Food.com recipes-and-reviews dataset from Kaggle.
# force_download=True is passed on every run; presumably this skips any
# locally cached copy -- confirm against the kagglehub documentation.
path = kagglehub.dataset_download(
    "irkaal/foodcom-recipes-and-reviews",
    force_download=True,
)

print("Path to dataset files:", path)

In [None]:
# Load the recipes table from the directory returned by the dataset download.
df = pd.read_csv(os.path.join(path, 'recipes.csv'))

In [None]:
# Recipe categories to keep when building the vector store.
# The hand-written list contains repeated entries (e.g. 'Vegan', 'Mexican'),
# which are removed below.
categories = [
    'Frozen Desserts', 'Chicken Breast', 'Beverages', 'Soy/Tofu',
    'Vegetable', 'Pie', 'Chicken', 'Dessert', 'Sauces', 'Stew',
    'Black Beans', '< 60 Mins', 'Lactose Free', 'Weeknight',
    'Yeast Breads', 'Whole Chicken', 'High Protein', 'Cheesecake',
    'Free Of...', 'High In...', 'Breakfast', 'Breads', 'Bar Cookie', 'Asian', 'Potato', 'Cheese', 'Meat', 'Very Low Carbs',
    'Spaghetti', 'Lunch/Snacks', 'Beans', 'Quick Breads',
    'Poultry', 'Healthy', 'Rice', 'Apple', 'Broil/Grill', 'Tex Mex',
    'German', 'Fruit', 'European', 'Smoothies', 'Greek', 'Corn',
    'Lentil', 'Spanish', 'Tuna', 'Citrus', 'Peppers', 'Salad Dressings',
    'Mexican', '< 15 Mins', 'One Dish Meal', 'Spicy', 'Thai', 'Cajun',
    'Oven', 'Microwave', 'Vegan', 'For Large Groups', 'Chinese', 'Grains',
    'Yam/Sweet Potato', 'Winter', 'African', 'Pasta Shells', 'Stocks',
    'Meatballs', 'Spring', 'Wild Game', 'Collard Greens',
    'Tilapia', 'Moroccan', 'Pressure Cooker', 'Korean', 'Spinach',
    'Kosher', 'Australian', 'Peanut Butter', 'Sweet',
    'Nuts', 'Filipino', 'Brunch', 'South American',
    'Beginner Cook', 'Egg Free', 'Dairy Free Foods',
    'Avocado', 'Pakistani', 'Ice Cream', 'Snacks Sweet', 'Main Dish Casseroles',
    'Pot Roast', 'Soups Crock Pot', 'Indian', 'Breakfast Casseroles',
    'Macaroni And Cheese', 'Mashed Potatoes', 'Desserts Fruit',
    'Pumpkin', 'Baking', 'Chicken Thigh & Leg',
    'Broccoli Soup', 'Apple Pie', 'Artichoke', 'From Scratch', 'Vegetable',
    'Lunch/Snacks', 'Potluck', 'Camping', 'Chicken Crock Pot', 'Peanut Butter Pie', 'Ice Cream',
    'Hawaiian', 'Vegan', 'Gluten Free Appetizers', 'Chocolate Chip Cookies',
    'Danish', 'Creole', 'Cajun', 'Colombian', 'Italian', 'Roast',
    'Breakfast Eggs', 'Soups Crock Pot', 'Fish Salmon', 'Snacks Sweet',
    'Bread Pudding', 'Inexpensive', 'Bread Machine', 'Mexican',
    'Pot Pie', 'Seafood', 'Oysters', 'Nigerian', 'Chard', 'Avocado',
    'Margarita', 'Hunan', 'Peruvian', 'Camping',
    'Turkey Gravy', 'Freezer', 'High Fiber', 'Healthy',
    'Kid Friendly','Moroccan'
]

# De-duplicate while keeping first-seen order. list(set(...)) would yield an
# arbitrary order that can differ between kernel restarts; dict.fromkeys keeps
# the result reproducible and still returns a plain list.
unique_categories = list(dict.fromkeys(categories))

In [4]:
from chromadb import Documents, EmbeddingFunction, Embeddings


In [5]:
class MyEmbeddingFunction(EmbeddingFunction):
    """Wrap a SentenceTransformer behind `embed_documents` / `embed_query`."""

    def __init__(self, model: SentenceTransformer):
        # Model used for every encode call made by this wrapper.
        self.model = model

    def embed_documents(self, input: Documents) -> Embeddings:
        """Encode a batch of texts and return the vectors as nested Python lists."""
        return self.model.encode(input).tolist()

    def embed_query(self, text: str) -> list:
        """Encode one query string and return its vector as a Python list."""
        query_vector = self.model.encode(text, convert_to_tensor=True)
        return query_vector.tolist()


In [None]:
# Local embedding model plus the persistent Chroma collection that the
# ingestion cells below write into.
model = SentenceTransformer('BAAI/bge-small-en-v1.5')
custom_embeddings = MyEmbeddingFunction(model)

# The persist directory must be the same one the query ("Test") section opens;
# that section reads "./databases/", so ingestion writes there too. With the
# previous "./database/" value, a fresh run indexed into one directory and then
# queried another.
vector_store = Chroma(
    collection_name="recipes",
    embedding_function=custom_embeddings,
    persist_directory="./databases/",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Build one Document per kept recipe (recipe name as page_content, the other
# columns as metadata) together with a parallel list of string ids.
allowed_categories = set(unique_categories)  # constant-time membership test per row

docs = []
ids_recipes = []
for _, row in df.iloc[:100000].iterrows():
    # Missing CSV cells are loaded by pandas as NaN, not None, so an
    # `is not None` check never filters them out; pd.isna covers both cases.
    if (
        row['RecipeCategory'] not in allowed_categories
        or pd.isna(row['Name'])
        or pd.isna(row['RecipeId'])
    ):
        continue

    params = {
        "description": row['Description'],
        "IngredientQuantities": row['RecipeIngredientQuantities'],
        "IngredientParts": row['RecipeIngredientParts'],
        "preparation_method": row['RecipeInstructions'],
        "Calories": row['Calories'],
        "FatContent": row['FatContent'],
        "SaturatedFatContent": row['SaturatedFatContent'],
        "CholesterolContent": row['CholesterolContent'],
        "SodiumContent": row['SodiumContent'],
        "CarbohydrateContent": row['CarbohydrateContent'],
        "FiberContent": row['FiberContent'],
        "SugarContent": row['SugarContent'],
        "ProteinContent": row['ProteinContent'],
        "RecipeCategory": row['RecipeCategory'],
    }

    docs.append(Document(page_content=row['Name'], metadata=params))
    ids_recipes.append(str(row['RecipeId']))

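The `yield` keyword turns a function into a generator: each `yield` hands one value back to the caller and pauses the function, so values are produced one at a time instead of being returned together as a list.

Unlike the `return` keyword, which ends the function immediately, a function that yields resumes right after the `yield` when the next value is requested, and keeps going until its body finishes.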

In [None]:
# Upper bound on how many documents are passed to a single add_documents call.
MAX_BATCH_SIZE = 40_000

# Split the docs and ids into smaller batches
def batch_documents(docs, ids, max_batch_size, start=0):
    """Yield aligned ``(docs, ids)`` slices of at most ``max_batch_size`` items.

    Args:
        docs: Sequence of documents to split.
        ids: Sequence of ids, positionally aligned with ``docs``.
        max_batch_size: Maximum number of items per yielded batch; must be > 0.
        start: Index of the first item to include. Defaults to 0 so that every
            document is batched; pass a larger offset only to resume a
            partially completed ingestion.

    Yields:
        Tuples ``(docs_slice, ids_slice)`` covering ``docs[start:]`` in order.

    Raises:
        ValueError: If ``max_batch_size`` is not positive (raised when the
            generator is first advanced).
    """
    if max_batch_size <= 0:
        raise ValueError("max_batch_size must be a positive integer")
    # Begin at `start` rather than a hard-coded offset, so no leading
    # documents are silently skipped.
    for offset in range(start, len(docs), max_batch_size):
        stop = offset + max_batch_size
        yield docs[offset:stop], ids[offset:stop]

# Add the documents to the vector store one bounded batch at a time.
batches = batch_documents(docs, ids_recipes, MAX_BATCH_SIZE)
for batch_docs, batch_ids in batches:
    vector_store.add_documents(ids=batch_ids, documents=batch_docs)


# Test

In [6]:
class MyEmbeddingFunction(EmbeddingFunction):
    """Wrap a SentenceTransformer behind `embed_documents` / `embed_query`.

    This cell redefines the class declared earlier in the notebook, so it
    carries the same two methods. Without `embed_documents`, re-running this
    cell would leave any later indexing call without a way to embed batches.
    """

    def __init__(self, model: SentenceTransformer):
        # Model used for every encode call made by this wrapper.
        self.model = model

    def embed_documents(self, input: Documents) -> Embeddings:
        """Encode a batch of texts and return the vectors as nested Python lists."""
        return self.model.encode(input).tolist()

    def embed_query(self, text: str) -> list:
        """Encode one query string and return its vector as a Python list."""
        return self.model.encode(text, convert_to_tensor=True).tolist()

In [7]:
# Load the embedding model and open the "recipes" collection stored under
# ./databases/ for querying.
model = SentenceTransformer('BAAI/bge-small-en-v1.5')
custom_embeddings = MyEmbeddingFunction(model)

vector_store = Chroma(
    persist_directory="./databases/",
    collection_name="recipes",
    embedding_function=custom_embeddings,
)

In [17]:
# Ask the store for up to three documents (k=3) matching a sample query.
retriever = vector_store.as_retriever(search_kwargs=dict(k=3))
result = retriever.invoke("Pizza Bread")

In [18]:
# Show every document returned for the "Pizza Bread" query.
result

[Document(metadata={'Calories': 970.3, 'CarbohydrateContent': 163.2, 'CholesterolContent': 46.8, 'FatContent': 18.0, 'FiberContent': 9.2, 'IngredientParts': 'c("warm water", "granulated sugar", "kosher salt", "unbleached all-purpose flour", "unbleached flour", "flour", "olive oil", "mozzarella cheese", "fresh oregano", "fresh basil")', 'IngredientQuantities': 'c("2 1/4", "1", "2", "14", "3", "3", NA, NA, "2", "2 -3", "2 -4")', 'ProteinContent': 34.9, 'RecipeCategory': 'Breads', 'SaturatedFatContent': 8.6, 'SodiumContent': 2243.4, 'SugarContent': 12.5, 'description': 'This originally came from the Pleasure Bar and Restaurant in Pittsburgh, PA, perhaps as early as the 1960\'s but definately during the 1970\'s. Located in the heart of the Bloomfield section of Pittsburgh, this Italian restaurant and bar was known (at least to me) as a local hangout for the Pittsburgh Penguins. But in the years I frequented it as a student at Pitt as well as hockey fan, I never saw any of the players there

In [19]:
# Metadata of the second returned document (index 1).
result[1].metadata

{'Calories': 519.5,
 'CarbohydrateContent': 50.0,
 'CholesterolContent': 81.3,
 'FatContent': 23.2,
 'FiberContent': 4.6,
 'IngredientParts': 'c("lean ground beef", "garlic", "red pepper flakes", "Italian-style stewed tomatoes", "sweet whole kernel corn", "cheddar cheese", "black olives", "green onion")',
 'IngredientQuantities': 'c("1/2", "1", "1/8", "1", "1", "1", "1 1/2", "1/4", "1/4")',
 'ProteinContent': 29.8,
 'RecipeCategory': '< 60 Mins',
 'SaturatedFatContent': 11.8,
 'SodiumContent': 1109.4,
 'SugarContent': 7.8,
 'description': 'Make and share this Pizza Bread recipe from Food.com.',
 'preparation_method': 'c("In skillet, brown meat with garlic and red pepper.", "Salt and pepper to taste; drain.", "Add tomatoes; cook, uncovered, over medium-high heat about 6 minutes or until thickened.", "Stir in corn.", "Spread over bread; top with cheese, olives and green onions.", "Bake for 8-10 minutes at 400 degrees.")'}

In [20]:
# Metadata of the first returned document (index 0).
result[0].metadata

{'Calories': 970.3,
 'CarbohydrateContent': 163.2,
 'CholesterolContent': 46.8,
 'FatContent': 18.0,
 'FiberContent': 9.2,
 'IngredientParts': 'c("warm water", "granulated sugar", "kosher salt", "unbleached all-purpose flour", "unbleached flour", "flour", "olive oil", "mozzarella cheese", "fresh oregano", "fresh basil")',
 'IngredientQuantities': 'c("2 1/4", "1", "2", "14", "3", "3", NA, NA, "2", "2 -3", "2 -4")',
 'ProteinContent': 34.9,
 'RecipeCategory': 'Breads',
 'SaturatedFatContent': 8.6,
 'SodiumContent': 2243.4,
 'SugarContent': 12.5,
 'description': 'This originally came from the Pleasure Bar and Restaurant in Pittsburgh, PA, perhaps as early as the 1960\'s but definately during the 1970\'s. Located in the heart of the Bloomfield section of Pittsburgh, this Italian restaurant and bar was known (at least to me) as a local hangout for the Pittsburgh Penguins. But in the years I frequented it as a student at Pitt as well as hockey fan, I never saw any of the players there but di

In [21]:
import re

# Sample input: arbitrary noise followed by a "name_recipe:" tag and the value.
text = "jjdjdjj +dj jjd  name_recipe: pizza"

# After the tag, skip optional whitespace, then capture from the first
# non-blank character through the rest of the line.
pattern = r'name_recipe:\s*(\S.*)'

match = re.compile(pattern).search(text)

if match is None:
    print("No recipe found.")
else:
    recipe_name = match.group(1)
    print(f"Recipe name: {recipe_name}")


Recipe name: pizza
