In [1]:
import os
from dotenv import load_dotenv

os.environ["JAVA_HOME"] = "C:\\Program Files\\Java\\jdk-24"

load_dotenv()
"""
I upload a private key that grants me access to GPT models for the LLM, as most of the LangGraph tools are based on interaction with ChatGPT.
I cannot upload the key here, as it is a private key.
"""
openai_api_key = os.getenv("OPENAI_API_KEY")

In [2]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableSequence
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
import matplotlib.pyplot as plt
import pandas as pd

In [3]:
df = pd.read_csv("C:\\Users\\figio\Desktop\\tryKaggle\\archive\\sampled_dataset.csv")
df = df.rename(columns={'Unnamed: 0': 'id'})

  df = pd.read_csv("C:\\Users\\figio\Desktop\\tryKaggle\\archive\\sampled_dataset.csv")


In [4]:
prompt = """
You are a culinary expert and food classification assistant. Your task is to determine the appropriate course type for a given recipe. Based on the ingredients and cooking instructions, classify the recipe into one of the following categories:

- Antipasto: Typically served as a starter or appetizer before the main meal. Often light and small in portion.

- Primo: Usually the first main course in a traditional meal. This includes pasta, risotto, soups, or other starch-based dishes.

- Secondo: The main course, generally focused on a primary protein such as meat, fish, or a substantial vegetarian alternative.

- Dessert: A sweet course served at the end of the meal.

- Contorno: A side dish, often served alongside the secondo, which can include vegetables or salads.

- Ambiguous: A dish that can be served as either a primo or secondo, depending on the context and preparation.

Consider both ingredients and preparation methods. Classify based on traditional culinary structures, especially Italian cuisine, but be flexible to international influences when appropriate. Output only the category name.
Don't make anything up. It is very difficult to categorize a dish sometimes, especially if they are not Italian or French. In these cases, the ambiguous category is the right one.

The recipe is: {recipe}

output: 
{{
	"Category": string  // The category of the recipe, such as Antipasto, Primo, Secondo, Dessert, Ambiguous, Contorno
	"Reasoning": string  // The reasoning behind the classification, explaining why the recipe fits into the chosen category
}}

IMPORTANT:
do not include any other information, just the category name and the reasoning. Use the format above, i don't want any other information.
"""

In [5]:
response_schemas = [
    ResponseSchema(
        name='Category',
        description='The category of the recipe, such as Antipasto, Primo, Secondo, Dessert, Contorno, Ambiguous',
        type='string',
        required=True,
    ),
    ResponseSchema(
        name='Reasoning',
        description='The reasoning behind the classification, explaining why the recipe fits into the chosen category',
        type='string',
        required=False,
    )
]

In [6]:
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

In [7]:
prompt_ = ChatPromptTemplate.from_messages(
    [('system', prompt)]
).partial(format_instructions=output_parser.get_format_instructions())

In [8]:
print(output_parser.get_format_instructions())

The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":

```json
{
	"Category": string  // The category of the recipe, such as Antipasto, Primo, Secondo, Dessert, Contorno, Ambiguous
	"Reasoning": string  // The reasoning behind the classification, explaining why the recipe fits into the chosen category
}
```


In [9]:
llm = ChatOpenAI(model="gpt-4", temperature=0.3)
recipe_chain = prompt_ | llm | output_parser

In [10]:
from tqdm import tqdm

# Load the first 1000 rows
df_sample = df.iloc[3000:4000]

# Initialize an empty list to store the "Category" results
categories = []
# Iterate through the sample dataset and make predictions with a progress bar
for _, row in tqdm(df_sample.iterrows(), total=len(df_sample), desc="Processing Recipes"):
    recipe = row['ingredients'] + " " + row['directions']
    try:
        response = recipe_chain.invoke({"recipe": recipe})
        categories.append(response['Category'])
    except Exception as e:
        print(f"Error processing recipe id {row['id']}: {e}")
        categories.append("Ambiguous")  # Append "Ambiguo" in case of an error
        continue

# The "categories" list now contains the predictions

Processing Recipes:  57%|█████▋    | 568/1000 [46:09<35:06,  4.88s/it]    


KeyboardInterrupt: 

In [21]:
# Create a new DataFrame with the original info and the prediction
df_sample2 = df.iloc[3000:3568]

results_df = df_sample2[['id', 'title', 'ingredients', 'directions']].copy()

print(results_df.head())
results_df['Category'] = categories

print(results_df.head())
# Define the output file path
output_file = "classified_recipes_4.csv"

# Create or overwrite the CSV file
results_df.to_csv(output_file, index=False)

print(f"Results saved to {output_file}")

           id                                  title  \
3000   374649                           Calico Salad   
3001   786781                       Tofu Herb Spread   
3002  1151634                 Egg Salad With A Twist   
3003  1480817  Pork Braised In Milk And Cream Recipe   
3004  1330322            Gluten Free Rosemary Bread    

                                            ingredients  \
3000  ["1 l. pkg. frozen mixed vegetables", "1 s. bo...   
3001  ["2 cloves garlic", "1 c. fresh basil, loosely...   
3002  ["4 large eggs", "2 tablespoons green onions",...   
3003  ["2 tablespoon butter, divided", "1 tablespoon...   
3004  ["1 egg", "1/3 cup egg whites", "1 tablespoon ...   

                                             directions  
3000  ['Cook vegetables in salted boiling water 8 mi...  
3001  ['Place the garlic, basil, parsley, celery and...  
3002  ['For best results in making the eggs easier t...  
3003  ['Combine 1 tablespoon butter and olive oil ov...  
3004  ['mix wet an

In [22]:
# Read the file again
results_df = pd.read_csv(output_file)

# Compute statistics about the classes
class_stats = results_df['Category'].value_counts()

# Print the statistics
print(class_stats)



Category
Dessert      182
Secondo      147
Primo         89
Antipasto     78
Contorno      61
Ambiguous     11
Name: count, dtype: int64
