In [None]:
import os

# Placeholder credential: replace the value with a real OpenAI API key before
# running the notebook (a later cell initialises an "openai:gpt-4o" chat model).
os.environ.update({"OPENAI_API_KEY": "YOUR_OPENAI_API_KEY"})

In [1]:
import sys
import os

# Resolve the application directory relative to the current working directory
# and register it on the import path, skipping the append when it is already there.
project_path = os.path.abspath(os.path.join(os.pardir, "app"))
if sys.path.count(project_path) == 0:
    sys.path.append(project_path)

### Request Classification

In [3]:
from core.state_graph.states.main_graph.agent_state import AgentState
from core.state_graph.nodes.main_graph.analyze_and_route_query import analyze_and_route_query
from langchain_core.runnables import RunnableConfig
import json

# Load the labelled routing examples (each entry carries "messages" and
# "expected_classification"). The encoding is pinned to UTF-8 so decoding does
# not depend on the platform's default locale encoding.
with open("datasets/classify_dataset.json", encoding="utf-8") as f:
    classify_dataset = json.load(f)

async def eval_classify_dataset():
    """Run the routing node over every entry of ``classify_dataset`` and score it.

    Each entry's ``messages`` are wrapped in an ``AgentState`` and passed to
    ``analyze_and_route_query``; the ``type`` of the returned ``router`` is
    compared with the entry's ``expected_classification``. One line per entry
    is printed with the expected and the obtained label.

    Returns:
        str: ``"<number of matches>/<number of entries>"``.
    """
    run_config = RunnableConfig()
    hits = 0
    for position, sample in enumerate(classify_dataset, start=1):
        agent_state = AgentState(messages=sample["messages"])
        result = await analyze_and_route_query(agent_state, config=run_config)
        expected_label = sample['expected_classification']
        predicted_label = result["router"].type
        print(f"{position} - expected: {expected_label} - classified as: {predicted_label}")
        if predicted_label == expected_label:
            hits += 1
    return f"{hits}/{len(classify_dataset)}"

# Top-level `await`: this cell relies on the notebook kernel's async support
# and would not run as a plain Python script.
evaluation = await eval_classify_dataset()
# Bare expression so the "<matches>/<total>" string is shown as the cell result.
evaluation

---ANALYZE AND ROUTE QUERY---
MESSAGES: [{'role': 'user', 'content': 'I want an easy salty recipe'}]
1 - expected: more-info - classified as: more-info
---ANALYZE AND ROUTE QUERY---
MESSAGES: [{'role': 'user', 'content': 'I want an easy salty recipe'}, {'role': 'ai', 'content': 'Do you have any other preferences?'}, {'role': 'user', 'content': 'No'}]
2 - expected: valid - classified as: valid
---ANALYZE AND ROUTE QUERY---
MESSAGES: [{'role': 'user', 'content': 'how much is 2+2?'}]
3 - expected: general - classified as: general
---ANALYZE AND ROUTE QUERY---
MESSAGES: [{'role': 'user', 'content': 'what is the weather like in Munich?'}]
4 - expected: general - classified as: general
---ANALYZE AND ROUTE QUERY---
MESSAGES: [{'role': 'user', 'content': 'Suggest me some recipes with pasta for lunch. It must be a main course of medium difficulty and it has to have less than 600 calories. I have no other preferences'}]
5 - expected: valid - classified as: valid
---ANALYZE AND ROUTE QUERY---
ME

'10/10'

### Init Graph

In [4]:
import os

# Connection settings for the Neo4j instance, exported as environment variables.
# The values point at a local bolt endpoint; adjust them for other deployments.
os.environ.update(
    {
        "NEO4J_URL": "bolt://localhost:7687",
        "NEO4J_USERNAME": "neo4j",
        "NEO4J_PASSWORD": "12345678",
    }
)

### Plan Generation

In [5]:
from core.state_graph.states.main_graph.agent_state import AgentState
from core.state_graph.nodes.main_graph.create_research_plan import create_research_plan
from core.knowledge_graph.graph import neo4j_graph
from langchain_core.runnables import RunnableConfig
from langchain_core.messages.ai import AIMessage
from langchain_core.messages.human import HumanMessage
from langchain.chat_models import init_chat_model
from typing import TypedDict

# Structured-output schema requested from the judge model; it is passed to
# `model.with_structured_output(...)` inside eval_plan_dataset.
# NOTE(review): documented with comments rather than a docstring on purpose —
# the class is handed to the LLM wrapper, which may derive schema text from it;
# confirm before adding a docstring.
class JudgeResponse(TypedDict):
    overall_score: int  # total score; the judge prompt defines it as the sum of five 0-2 criteria (0-10)
    logic: str  # free-text justification returned by the judge

# System prompt for the LLM judge. It is filled in with str.format using three
# placeholders: {schema}, {expected_plan} and {generated_plan}. Any literal
# braces added to this text later must be doubled to survive formatting.
plan_judge_prompt = """You are an expert evaluator specializing in knowledge retrieval and reasoning over graph databases. 
Your task is to evaluate a step-by-step research plan produced by a Food Assistant designed to retrieve information from a Neo4j graph, based on a user request.

You will assess the quality of the generated plan in comparison to the expected plan. Focus on the following criteria:
1. **Correctness** - Are the steps logically sound and appropriate for the retrieval task?
2. **Use of Retrieval Methods** - Are Semantic Search and Query Search used correctly and in accordance with the rules provided?
3. **Step Dependencies** - Are dependencies between steps clearly described and accurate, especially when context from previous steps is required?
4. **Alignment with Graph Schema** - Does the plan respect the graph schema, including available node labels, properties, and known values?
5. **Completeness and Efficiency** - Does the plan include all necessary steps without redundancy or omission?

Provide a structured evaluation based on these criteria. Use the following format:

Evaluation:
- Correctness: [Score 0-2] - [Justification]
- Use of Retrieval Methods: [Score 0-2] - [Justification]
- Step Dependencies: [Score 0-2] - [Justification]
- Schema Alignment: [Score 0-2] - [Justification]
- Completeness & Efficiency: [Score 0-2] - [Justification]

Then conclude with an overall judgment, that is the sum of all the evaluation parts, from 0 to 10.

Schema:
<schema>
{schema}
</schema>

Expected Plan:
<expected_plan>
{expected_plan}
</expected_plan>

Generated Plan:
<generated_plan>
{generated_plan}
</generated_plan>
"""

# Load the plan-evaluation examples (each entry carries "messages" and
# "expected_plan"). The encoding is pinned to UTF-8 so decoding does not depend
# on the platform's default locale encoding.
with open("datasets/plan_dataset.json", encoding="utf-8") as f:
    plan_dataset = json.load(f)

async def eval_plan_dataset():
    """Score generated research plans against the expected ones with an LLM judge.

    For every entry of ``plan_dataset`` a plan is generated with
    ``create_research_plan``, rendered as a numbered list, and sent to the
    judge together with the graph schema and the entry's expected plan. Each
    structured verdict is printed as it arrives.

    Relies on the module-level names ``plan_dataset``, ``plan_judge_prompt``,
    ``neo4j_graph``, ``JudgeResponse`` and ``model``. ``model`` is looked up at
    call time, so it must be defined before this coroutine is awaited.

    Returns:
        str: ``"Avg score: <mean overall_score>"``; the mean is ``nan`` when
        the dataset is empty.
    """
    if not plan_dataset:
        # Nothing to average: avoid a ZeroDivisionError in the final division.
        return f"Avg score: {float('nan')}"

    config = RunnableConfig()
    # Loop-invariant: build the structured-output judge once instead of per entry.
    judge = model.with_structured_output(JudgeResponse)
    tot_score = 0
    for entry in plan_dataset:
        state = AgentState(messages=entry["messages"])
        response = await create_research_plan(state, config=config)

        # The step counter has its own name so it cannot clobber an outer loop variable.
        generated_plan = "\n".join(
            f"{step_no}. ({step['type']}): {step['question']}"
            for step_no, step in enumerate(response["steps"], start=1)
        )

        system_prompt = plan_judge_prompt.format(
            schema=neo4j_graph.get_schema,  # read as an attribute, not called
            expected_plan=entry["expected_plan"],
            generated_plan=generated_plan,
        )
        messages = [{"role": "system", "content": system_prompt}]
        # Awaited so the judge request does not block the running event loop.
        eval_response: JudgeResponse = await judge.ainvoke(messages)

        print(eval_response)
        tot_score += eval_response["overall_score"]

    return f"Avg score: {tot_score / len(plan_dataset)}"

# Judge model. It must be bound to the global name `model` before the call
# below, because eval_plan_dataset looks `model` up when it runs.
# temperature=0.0 and streaming=True are passed straight to init_chat_model.
model = init_chat_model(model="openai:gpt-4o", temperature=0.0, streaming=True)
# Top-level `await`: relies on the notebook kernel's async support.
evaluation = await eval_plan_dataset()
# Bare expression so the "Avg score: ..." string is shown as the cell result.
# Note that `evaluation` is rebound here, replacing the classification score.
evaluation

---PLAN GENERATION---
Plan
1. (query_search): Retrieve all recipes with difficulty 'easy'.
2. (query_search): From the easy recipes retrieved, find those that have a relationship with the 'Salty' flavor.
Reduced Plan
1. (query_search): Retrieve all easy recipes with a 'Salty' flavor.
Reviewed Plan
1. (query_search): Retrieve all easy recipes with a 'Salty' flavor.
{'logic': "Evaluation:\n- Correctness: 2 - The generated plan correctly identifies the need to retrieve recipes with 'easy' difficulty and 'Salty' flavor, which aligns with the user request.\n- Use of Retrieval Methods: 2 - The plan uses query_search appropriately to retrieve recipes based on specific properties and relationships, which is suitable for the task.\n- Step Dependencies: 2 - The plan is a single step and does not require dependencies between multiple steps, which is appropriate for the simplicity of the task.\n- Schema Alignment: 2 - The plan respects the graph schema by correctly identifying the 'difficulty' and

'Avg score: 7.181818181818182'