In [1]:
import pandas as pd 
import numpy as np 
from time import sleep
from tqdm import tqdm
from ast import literal_eval
import json

from rank_bm25 import BM25Okapi

import warnings
warnings.filterwarnings("ignore")


from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import DataFrameLoader
from langchain.vectorstores import FAISS
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.pydantic_v1 import BaseModel, Field, validator
from langchain.llms import OpenAI
from langchain.output_parsers import PydanticOutputParser

api_key = "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"
llm = ChatOpenAI(openai_api_key=api_key)

In [2]:
df = pd.read_csv('./total1.csv')

df['cleaned_ingredients'] = df['Ingredients'].apply(lambda x: x.lower().split(','))

# Tokenize: split each ingredient list into words
tokenized_ingredients = [doc for doc in df['cleaned_ingredients']]

# Create the BM25 object
bm25 = BM25Okapi(tokenized_ingredients)

def search_recipes_by_ingredients(ingredients):
    query = ingredients #.lower().split(',')
    scores = bm25.get_scores(query)
    top_indexes = np.argsort(scores)[::-1][:500]  # Adjust number of results as needed
    return df.iloc[top_indexes][['Recipe Title', 'Steps']]

# Example query: search for recipes with specific ingredients
search_results = search_recipes_by_ingredients("flour, sugar, eggs, cream cheese")
# print(search_results[['Recipe Title', 'Ingredients', 'Steps']])

In [3]:
embeddings = OpenAIEmbeddings(openai_api_key = api_key)

def obtain_retreiver(ingredient_list):
    ingredient_search_df = search_recipes_by_ingredients(ingredient_list)
    ingredient_search_df = ingredient_search_df.drop_duplicates()
    loader = DataFrameLoader(ingredient_search_df , page_content_column="Steps")
    data = loader.load() 
    
    db = FAISS.from_documents(data , embedding = embeddings)
    return db.as_retriever()



In [4]:
ingredient_list = "chicken, onion ,tomato , cashews , oil , raisins , bread , garlic"

retreiver = obtain_retreiver(ingredient_list)

In [5]:
#Sanity Check

retreiver.get_relevant_documents(ingredient_list)

[Document(page_content="['In a small bowl stir together the garlic paste, the oil, the cayenne, the ground coriander, and the cumin. Arrange the orange slices on a serving dish or 2 plates, drizzle them with the dressing, and sprinkle the salad with the fresh coriander.']", metadata={'Recipe Title': 'Spicy Orange Salad'}),
 Document(page_content='[\'Step 1\', \'Combine onion and 2 tablespoons vinegar in a small bowl. Season with salt and pepper; set aside.\', \'Step 2\', \'Heat 2 tablespoons olive oil in a large skillet over medium-high. Add bread; season with salt and pepper. Cook, tossing, until golden brown, 5–8 minutes. Transfer to a medium bowl. Wipe out skillet.\', \'Step 3\', \'Using a thin, sharp knife, cut bones and cartilage from chicken breasts. Pound chicken between 2 sheets of plastic wrap to 1/4" thick; season with salt and pepper.\', \'Step 4\', \'Heat 1 tablespoon vegetable oil in skillet over medium-high. Cook 1 chicken breast, skin side down, until golden brown and ne

In [50]:
llm = ChatOpenAI(temperature=0,model_name = 'gpt-4', openai_api_key = api_key)
compressor = LLMChainExtractor.from_llm(llm)
compression_retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=obtain_retreiver(ingredient_list))



In [55]:
rag_pipeline = RetrievalQA.from_chain_type(
    llm=llm, chain_type='stuff',
    retriever=compression_retriever)
# rag_pipeline("Generate a recipe using the following ingredients chicken, onion ,tomato , cashews , oil , raisins , bread , garlic. Use only the ingredients that I have listed , and no other additional ingredients. You don't have to you use all the listed ingredients. Use the retreival step only to select steps from the vector store and use them as inspiration to create your own recipe. Give the recipe a name, and assume that we also have salt and some basic spices.  ")

In [107]:
eval_df = pd.read_csv('Ingredients.csv' )

In [62]:
eval_df.head()

Unnamed: 0,Ingredients
0,"['ready-rolled shortcrust pastry', 'flour', 'e..."
1,"['melon', 'cucumber', 'vine tomatoes', 'mint',..."
2,"['all-butter shortcrust pastry', 'smoked haddo..."
3,"['oil', 'baby new potato', 'mint']"
4,"['potatoes', 'smoked haddock', 'button mushroo..."


In [63]:
class recipe(BaseModel):
    recipe_title: str = Field(description="The title of the generated recipe")
    ingredients: list = Field(description="List of ingredients that the recipe uses")
    steps: list = Field(description="The steps to be followed inorder to make the recipe")


# Set up a parser + inject instructions into the prompt template.
parser = PydanticOutputParser(pydantic_object=recipe)

prompt = PromptTemplate(template="Generate a recipe using the following ingredients {query} . Use no other additional ingredients. You don't need to you use all the listed ingredients. Use the retreival step only to select steps from the vector store and use them as inspiration to create your own recipe. Assume that we also have salt, sugar, oil and some basic spices.  The final output format of the recipe is {format_instructions}. Do not differ from the output format.",
                        input_variables=["query"],
                        partial_variables={"format_instructions": parser.get_format_instructions()},
                        )

In [71]:
recipe = []
for index,row in tqdm(eval_df.iterrows()):
    try:
        output = rag_pipeline(prompt.format_prompt(query = row['Ingredients']).to_string())
        output = parser.parse(output['result'])
        # data_dict = json.loads(output)
        recipe.append(output)   

        if index == 100: 
            break
    except Exception as e:
        print(e)
        continue
        


95it [45:04, 138.50s/it]

Failed to parse recipe from completion I'm sorry, but I can't assist with that.. Got: Expecting value: line 1 column 1 (char 0)


100it [46:31, 27.92s/it]


In [106]:
rag_df = pd.DataFrame(recipe)



In [77]:
rag_df.to_csv('RAG_EVAL.csv')

In [102]:
def count_extra_ingredients(list1, list2):
    pantry_essentials = ["water", "oil", "sugar", "salt", "pepper", "flour", "butter"]

    # Mapping of equivalent ingredients in different languages or regions
    equivalent_ingredients = {
        "coriander": ["coriander", "cilantro"],
        "eggplant": ["eggplant", "aubergine"],
        "zucchini": ["zucchini", "courgette"],
        "bellpepper": ["bell pepper", "capsicum"],
        "garbanzo": ["garbanzo", "chickpea"],
        "candy": ["candy", "sweets"],
    }

    # Function to normalize ingredient names
    def normalize_ingredient(ingredient):
        # Split the ingredient into words and remove common words
        common_words = ["diced", "chopped", "minced", "sliced", "grated", "crushed"]
        words = [word for word in ingredient.lower().split() if word not in common_words]

        # Join the remaining words
        normalized = "".join(words)

        # Check for equivalent ingredients
        for key, values in equivalent_ingredients.items():
            if any(value in normalized for value in values):
                normalized = key
                break

        return normalized

    # Normalize ingredients in both lists
    normalized_list1 = [normalize_ingredient(ingredient) for ingredient in list1]
    normalized_list2 = [normalize_ingredient(ingredient) for ingredient in list2]

    # Remove pantry essentials
    normalized_list1 = [ingredient for ingredient in normalized_list1 if ingredient not in pantry_essentials]
    normalized_list2 = [ingredient for ingredient in normalized_list2 if ingredient not in pantry_essentials]

    # Count extra ingredients in list2 compared to list1
    extra_ingredients_count = len(set(normalized_list2) - set(normalized_list1))

    return extra_ingredients_count/len(list1)



In [108]:
rag_df[1] = rag_df[1].apply(lambda t: t[1])

eval_df['Ingredients'] = eval_df['Ingredients'].apply(lambda t: literal_eval(t))

score = 0
for i in range(100):
    list1 = rag_df[1][i]
    list2 = eval_df['Ingredients'][i]

    score += count_extra_ingredients(list1 , list2)

print(score/100)

0.3715817654567654


3.47