# Generate Evaluation Dataset

This Notebook generates a jsonl file that will be used to evaluate the efficacy of a prompt that is designed to extract long-term memories from a message.

For instance, the following sentence:

- I work long hours during the week, so having a meal plan that is quick and easy to prepare is crucial for me.

Should be converted into a memory:

- Works long hours
- Meal plans should be quick and easy

In order to evaluate our prompt, we will be generating the following:

- Inputs: we will generate a series of possible messages that a user would share with the AI about meal planning
- Existing memories: to test for each input, since we will either create new memories, update old memories, delete old memories, or do nothing depending on the existing memories
- Expected output: the output we expect based on the input and existing memories, so that we can test if our output matches expected output
- Bad output: an example of output we would not expect based on the input and existing memories, to test the system’s ability to discriminate between correct and incorrect outputs

Here is an example that we will be generating:

{
    "input": "I work long hours during the week, so having a meal plan that is quick and easy to prepare is crucial for me.", 
    "memories": ["Meal plan needs to be slow and difficult to prepare"], 
    "desired_response": [{"knowledge": "Meal plan needs to be quick and easy to prepare", "category": "Attribute", "action": "Update", "knowledge_old": "Meal plan needs to be slow and difficult to prepare"}, {"knowledge": "Works long hours during the week", "category": "Attribute", "action": "Create"}],
    "bad_response": [{"knowledge": "Meal plan needs to be elaborate and time-consuming to prepare", "category": "Attribute", "action": "Update", "knowledge_old": "Meal plan needs to be slow and difficult to prepare"}, {"knowledge": "Works long hours during the week", "category": "Like", "action": "Create"}]
}


### Notebook setup

In [1]:
# %pip install openai==1.12.0 langchain==0.1.6 langchain_openai==0.0.5

In [2]:
from dotenv import load_dotenv
import os

# Load .env file
# Reads environment variables from the path '../.env' so that the os.getenv
# calls below can pick them up.
load_dotenv('../.env')

# Set model variables
# These are plain Python names. In the cells visible here none of them is
# passed explicitly to ChatOpenAI.
OPENAI_BASE_URL = "https://api.openai.com/v1"
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")  # None when the variable is not set
OPENAI_ORGANIZATION = os.getenv("OPENAI_ORGANIZATION")  # None when the variable is not set

# NOTE(review): this only binds a Python variable; it does not write to
# os.environ. If the intent is to switch LangChain tracing off, confirm
# whether the environment variable of the same name should be set instead.
LANGCHAIN_TRACING_V2=False

### Define the Run class
This will help us manage our iterative creation of run data

In [3]:
import json


class Run:
    """One row of the evaluation dataset, built up step by step.

    A run starts with just the user ``input``; later steps attach the
    existing ``memories``, the ``desired_response`` (parsed tool-call
    arguments), a ``bad_response`` and the ``desired_assistant_message`` used
    for fine-tuning. ``to_dict`` produces the JSON-serialisable row.

    Args:
        input: The user message for this run.
        desired_response: Expected tool-call arguments ("" until generated).
        bad_response: Deliberately wrong tool-call arguments ("" until
            generated).
        memories: Existing memories for this run. When omitted, each instance
            gets its own new empty list.
        desired_assistant_message: Assistant message for fine-tuning ("" until
            generated).
    """

    def __init__(
        self,
        input,
        desired_response="",
        bad_response="",
        memories=None,
        desired_assistant_message="",
    ):
        self.input = input
        # A default of ``[]`` in the signature would be one list shared by
        # every instance; create a fresh one per run instead.
        self.memories = [] if memories is None else memories
        self.desired_response = desired_response
        self.bad_response = bad_response
        self.desired_assistant_message = desired_assistant_message

    def update_memories(self, new_memories):
        """Replace ``memories`` with the knowledge strings found in a tool-call payload."""
        self.memories = self.extract_knowledge(new_memories)

    def update_desired_response(self, response):
        """Store the expected tool-call arguments and the assistant message.

        ``desired_response`` receives the parsed arguments of *every* tool
        call, while ``desired_assistant_message`` is built from the *first*
        tool call only.

        Raises:
            KeyError / IndexError: if ``response`` has no ``"tool_calls"``
                entry or the list is empty (``desired_response`` has already
                been updated at that point).
        """
        self.desired_response = self.extract_arguments(response)

        function_call = response["tool_calls"][0]["function"]

        self.desired_assistant_message = {
            "role": "assistant",
            "function_call": {
                "name": function_call["name"],
                "arguments": function_call["arguments"],
            },
        }

    def update_bad_response(self, response):
        """Store the parsed tool-call arguments of ``response`` as the bad response."""
        # Must target bad_response; writing to desired_response here would
        # clobber the expected output and leave bad_response unset.
        self.bad_response = self.extract_arguments(response)

    def extract_arguments(self, data):
        """Return the JSON-decoded ``arguments`` of each tool call in ``data``.

        Anything that is not a dict containing ``"tool_calls"`` yields an
        empty list. Arguments that are not strings are ignored; strings that
        are not valid JSON are reported with ``print`` and skipped.
        """
        arguments_list = []

        # Check if data is a dictionary and contains the key 'tool_calls'
        if isinstance(data, dict) and "tool_calls" in data:
            for tool_call in data["tool_calls"]:
                # Extracting the 'arguments' from the tool_call
                function_info = tool_call.get("function", {})
                arguments = function_info.get("arguments", "")

                # If arguments is a string, attempt to parse it as JSON
                if isinstance(arguments, str):
                    try:
                        arguments_json = json.loads(arguments)
                        arguments_list.append(arguments_json)
                    except json.JSONDecodeError:
                        print("Error decoding JSON from arguments:", arguments)

        return arguments_list

    def extract_knowledge(self, memories):
        """Return the non-empty ``knowledge`` value of each tool call in ``memories``.

        Any payload that is not a dict holding a ``"tool_calls"`` list (for
        example ``""`` or ``None``) yields an empty list, mirroring
        ``extract_arguments``. Arguments that are not valid JSON are reported
        with ``print`` and skipped.
        """
        knowledge_list = []

        if not isinstance(memories, dict):
            return knowledge_list

        tool_calls = memories.get("tool_calls")
        if not isinstance(tool_calls, list):
            return knowledge_list

        for item in tool_calls:
            function = item.get("function", {})
            arguments = function.get("arguments", "")

            try:
                arguments_json = json.loads(arguments)
                knowledge = arguments_json.get("knowledge", "")
                if knowledge:
                    knowledge_list.append(knowledge)
            except json.JSONDecodeError:
                print("Error decoding JSON from arguments:", arguments)

        return knowledge_list

    def to_dict(self):
        """Return the run as a plain dict (the row written to the JSONL file)."""
        return {
            "input": self.input,
            "memories": self.memories,
            "desired_response": self.desired_response,
            "bad_response": self.bad_response,
            "desired_assistant_message": self.desired_assistant_message
        }

In [4]:
# A simple test to confirm our setup is correct
# Feeds update_memories a hand-written payload: a dict with a "tool_calls"
# list whose "arguments" value is a JSON string. Nothing is asserted or
# printed here; afterwards run.memories holds the decoded "knowledge" value
# ("I don't eat fish").
run = Run(input="Your input data")
run.update_memories(
    {
        "tool_calls": [
            {
                "index": 0,
                "id": "call_NqdC7z7UBDUIRjZ8lQbTNRkr",
                "function": {
                    "arguments": '{"knowledge":"I don\'t eat fish","category":"Dislike","action":"Create"}',
                    "name": "Knowledge_Modifier",
                },
                "type": "function",
            }
        ]
    }
)

### Set up the constants

In [5]:
# Output locations, relative to the current working directory.
# NOTE(review): the files are opened with open(path, "w"), which does not
# create missing directories - confirm that ./data exists before running.
EVAL_FILE_PATH = "./data/eval_dataset.jsonl"  # written in Step 1, rewritten in place by Steps 2-4
FINETUNE_FILE_PATH = "./data/finetune_dataset.jsonl"  # written in Step 5

NUM_RUNS = 10  # Adjust the number of runs as needed

In [6]:
from enum import Enum


# Categories a piece of knowledge can be filed under. The `str` mix-in makes
# every member compare equal to its string value.
# NOTE(review): documented with comments rather than a class docstring - the
# function schema hard-coded in generate_data_for_finetuning carries the
# description "An enumeration." for this enum, which presumably derives from
# the class docstring; confirm before adding one.
class Category(str, Enum):
    Food_Allergy = "Allergy"  # a food allergy
    Food_Like = "Like"  # a food that is liked
    Food_Dislike = "Dislike"  # a food that is disliked
    Family_Attribute = "Attribute"  # any other attribute relevant to meal planning


# Operations the knowledge tool can request on the knowledge base. The `str`
# mix-in makes every member compare equal to its string value.
# NOTE(review): documented with comments rather than a class docstring - the
# function schema hard-coded in generate_data_for_finetuning carries the
# description "An enumeration." for this enum, which presumably derives from
# the class docstring; confirm before adding one.
class Action(str, Enum):
    Create = "Create"  # add a new record
    Update = "Update"  # change an existing record
    Delete = "Delete"  # remove an existing record

### Step 1 - Create the input text

In [7]:
# Set up the chain for generating inputs

from langchain_openai.chat_models import ChatOpenAI
from langchain.prompts import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
)

# System prompt that asks the model to role-play a person doing weekly meal
# planning and to answer one question in 1-2 sentences. `{question}` is the
# only template variable; it is supplied when the chain is invoked.
SYSTEM_TEMPLATE_GENERATE_INPUT_TEXT = """
You are a person who is trying to meal-plan for a week.

Before responding, come up with a persona, backstory, and goal for the person you are helping:

- First, come up with your persona: are you a mother, father, boyfriend, wife, single adult? Do you have kids? How many people are your family?
- Second, come up with a backstory: are you a busy professional? A stay-at-home parent? A college student?
- Third, come up with a goal: are you trying to eat healthier? Save money? Save time?

Now that you have identified your persona, imagine you are in the middle of meal-planning and someone just asked you the following question:

{question}

Answer the question in a way that a person with your exact backstory might if it was in the middle of a long conversation. You might simply be answering the question, but you may also be referencing something from earlier in the conversation or even providing extra context to explain your answer.

In general, your answers should be pretty short (1 or 2 sentences).

Just answer the question, don't share the persona, backstory, or goal you came up with. We will use that information to help guide the AI to give you a better response.

Your response should only contain 1 key piece of information. For example "I like X food" instead of "I like X and Y foods", but it should exist within a 1 or 2 sentence response. Your goal is to help train and evaluate how good an AI is at extracting this bit of information from a conversation, so don't be way too obvious about your response.

Be super imaginative with your answers. Don't just provide a boring answer. This will help the AI learn to be more creative and interesting in its responses.

I will reward you if you provide an answer I've never seen before.
"""


# Get the prompt to use - you can modify this!
# The chat prompt consists of the single system message defined above.
prompt = ChatPromptTemplate.from_messages(
    [
        SystemMessagePromptTemplate.from_template(SYSTEM_TEMPLATE_GENERATE_INPUT_TEXT),
    ]
)

# Choose the LLM that will drive the agent
llm = ChatOpenAI(
    model="gpt-3.5-turbo-0125",
    temperature=1.0,
)

# Pipe the prompt into the model. Later cells rebind the names `prompt` and
# `llm`; this runnable keeps the objects that are bound when this line runs.
# It is invoked with a dict containing a "question" key.
generate_eval_runnable = prompt | llm

In [8]:
# Generate NUM_RUNS inputs and save them to a JSONL file

import json

# Questions for the input
QUESTION_1 = "Do you have any dietary restrictions?"
QUESTION_2 = "What kind of food do you like?"
QUESTION_3 = "What should I know about you that would make your meal plan more helpful every week?"

# The questions are used in rotation, not at random:
# run 0 -> QUESTION_1, run 1 -> QUESTION_2, run 2 -> QUESTION_3, run 3 -> QUESTION_1, ...
QUESTION_ROTATION = (QUESTION_1, QUESTION_2, QUESTION_3)

runs = []

for i in range(NUM_RUNS):
    question_input = {"question": QUESTION_ROTATION[i % 3]}

    # The model's reply (its .content) becomes the user input of a new run
    response = generate_eval_runnable.invoke(question_input)

    # Only the input is known at this stage; the other Run fields keep their defaults
    runs.append(Run(input=response.content).to_dict())

# Save the runs to a JSONL file: one JSON object per line
with open(EVAL_FILE_PATH, "w") as outfile:
    for run in runs:
        outfile.write(json.dumps(run) + "\n")

print(f"Saved {len(runs)} runs to {EVAL_FILE_PATH}")

Saved 10 runs to eval_dataset.jsonl


### Step 2 - Generate memories for each input

In [9]:
# Set up the tool for extracting memories

from langchain.pydantic_v1 import BaseModel, Field
from langchain.tools import StructuredTool
from typing import Optional


# Argument schema for the Knowledge_Modifier tool: one create / update /
# delete operation on a single piece of knowledge. The Field descriptions are
# runtime strings that form part of the schema.
# NOTE(review): documented with comments rather than a class docstring, since
# pydantic may surface a model docstring in the generated schema - confirm
# before adding one.
class AddKnowledge(BaseModel):
    # Required: the new (or replacement) knowledge text.
    knowledge: str = Field(
        ...,
        description="Condensed bit of knowledge to be saved for future reference in the format [person] [fact] (e.g. Husband doesn't like tuna, I am allergic to shellfish, etc)",
    )
    # Optional (defaults to None): the existing record an update/delete refers to.
    knowledge_old: Optional[str] = Field(
        None,
        description="If updating or deleting record, the complete, exact phrase that needs to be modified",
    )
    # Required: one of the Category enum values.
    category: Category = Field(
        ..., description="Category that this knowledge belongs to"
    )
    # Required: one of the Action enum values.
    action: Action = Field(
        ...,
        description="Whether this knowledge is adding a new record, updating a record, or deleting a record",
    )


def handle_action(
    knowledge: str,
    category: str,
    action: str,
    knowledge_old: str = "",
) -> dict:
    """Log a knowledge operation to stdout.

    This is the callable behind the Knowledge_Modifier tool. It only prints
    its arguments (knowledge, old knowledge, category, action - in that
    order) and implicitly returns ``None`` despite the ``-> dict``
    annotation. The signature is left exactly as is: the tool description
    reproduced in ``generate_data_for_finetuning`` embeds this signature text.
    """
    logged_fields = (knowledge, knowledge_old, category, action)
    print("Handling Knowledge: ", *logged_fields)


# Wrap handle_action as a LangChain StructuredTool named "Knowledge_Modifier"
# whose argument schema is AddKnowledge. The tool is later converted to an
# OpenAI function definition and bound to the extraction model.
knowledge_modifier = StructuredTool.from_function(
    func=handle_action,
    name="Knowledge_Modifier",
    description="Add, update, or delete a bit of knowledge",
    args_schema=AddKnowledge,
)

In [10]:
# Set up the chain for extracting memories

from langchain_openai.chat_models import ChatOpenAI
from langchain.prompts import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    MessagesPlaceholder,
)
from langchain_core.utils.function_calling import convert_to_openai_function

# System prompt for the memory-extraction model. `{memories}` is the only
# template variable; the conversation itself is appended through the
# "messages" placeholder below.
# NOTE(review): the prompt text contains the typos "eperts" and "identiy".
# They are left untouched here because this string is sent to the model
# verbatim and editing it changes the prompt.
SYSTEM_PROMPT_EXPECTED_RESPONSE = """
You are a supervisor managing a team of knowledge eperts.

Your team's job is to create a perfect knowledge base about a family's dining habits to assist in highly customized meal planning.

The knowledge base should ultimately consist of many discrete pieces of information that add up to a rich persona (e.g. I like pasta; I am allergic to shellfish; I don't eat mussels; I live in Austin, Texas; I have a husband and 2 children aged 5 and 7).

Every time you receive a message, you will evaluate if it has any information worth recording in the knowledge base.

A message may contain multiple pieces of information that should be saved separately.

You are only interested in the following categories of information:

1. The family's food allergies (for example: a dairy or soy allergy) - These are important to know because they can be life-threatening. Only log something as an allergy if you are certain it is an allergy and not just a dislike.
2. Foods the family likes (for example: likes pasta) - These are important to know because they can help you plan meals, but are not life-threatening.
3. Foods the family dislikes (for example: doesn't eat mussels or rarely eats beef) - These are important to know because they can help you plan meals, but are not life-threatening.
4. Attributes about the family that may impact weekly meal planning (for example: lives in Austin, has a husband and 2 children, has a garden, likes big lunches, etc.)

When you receive a message, you perform a sequence of steps consisting of:

1. Analyze the most recent Human message for information. You will see multiple messages for context, but we are only looking for new information in the most recent message.
2. Compare this to the knowledge you already have.
3. Determine if this is new knowledge, an update to old knowledge that now needs to change, or should result in deleting information that is not correct. It's possible that a food you previously wrote as a dislike might now be a like, or that a family member who previously liked a food now dislikes it - those examples would require an update.

Here are the existing bits of information that we have about the family.

```
{memories}
```

Call the right tools to save the information, then respond with DONE. If you identiy multiple pieces of information, call everything at once. You only have one chance to call tools.

I will tip you $20 if you are perfect, and I will fine you $40 if you miss any important information or change any incorrect information.

Take a deep breath, think step by step, and then analyze the following message:
"""

# System prompt first, then whatever message list is passed as "messages".
prompt = ChatPromptTemplate.from_messages(
    [
        SystemMessagePromptTemplate.from_template(SYSTEM_PROMPT_EXPECTED_RESPONSE),
        MessagesPlaceholder(variable_name="messages"),
    ]
)

# Choose the LLM that will drive the agent
llm = ChatOpenAI(
    model="gpt-3.5-turbo-0125",
    # model="gpt-4-0125-preview",
    streaming=True,
    temperature=0.0,
)

# Create the tools to bind to the model
agent_tools = [knowledge_modifier]
tools = [convert_to_openai_function(t) for t in agent_tools]

# Invoked with {"messages": [...], "memories": ...}. Callers in this notebook
# read the tool calls from the response's `additional_kwargs`.
knowledge_master_runnable = prompt | llm.bind_tools(tools)

In [11]:
# Set up the chain for mutating a memory

from langchain_openai.chat_models import ChatOpenAI
from langchain.prompts import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
)

# System prompt that asks the model to change the meaning of one memory by
# altering a single word. `{memory}` is the only template variable.
SYSTEM_TEMPLATE_MUTATE_MEMORY = """
Your job is to mutate a string of text to change the meaning of it by only changing or modifying a single word.

This may take a few possible paths:
- You might make the meaning the opposite of the original meaning
- You might change the intensity of the meaning (from like to love, or from dislike to hate)
- You might take an allergy and just make it a dislike, or vice versa

But you will not change the subject or object in the sentence, just the relationship between them.

Here is the sentence to modify: 

```
{memory}
```

Now return a string that contains the modified sentence and nothing else.
"""

# Get the prompt to use - you can modify this!
# The chat prompt consists of the single system message defined above.
prompt = ChatPromptTemplate.from_messages(
    [
        SystemMessagePromptTemplate.from_template(SYSTEM_TEMPLATE_MUTATE_MEMORY),
    ]
)

# Choose the LLM that will drive the agent
llm = ChatOpenAI(
    model="gpt-3.5-turbo-0125",
    temperature=1.0,
)

# Invoked with {"memory": <text>}; callers read the mutated text from `.content`.
mutate_memory_runnable = prompt | llm

In [12]:
# Iterate through each input and generate memories for each

import json
from langchain_core.messages import HumanMessage
import random


def generate_memories_for_eval_dataset(file_name):
    """Populate the ``memories`` field for every row of a JSONL eval file.

    The file is read completely and then overwritten in place. Blank lines
    are ignored and rows without an ``"input"`` key are dropped. Every
    remaining row is rebuilt through ``Run(...).to_dict()``, so only
    ``input`` and the newly generated ``memories`` carry over; the other
    fields take ``Run``'s defaults.

    For each row one strategy is chosen at random:

    1. Extract memories from the row's own input.
    2. As 1, then replace one randomly chosen memory with a mutated version
       so that a later step can exercise an update.
    3. No memories.
    4. Extract memories from the joined inputs of 1 or 2 earlier rows. This
       option is only offered once three earlier rows exist (``index > 2``).

    Args:
        file_name: Path of the JSONL file to read and rewrite.
    """
    updated_runs = []

    with open(file_name, "r") as infile:
        # A blank line (for example a trailing newline) is not a record;
        # skipping it avoids a JSONDecodeError from json.loads("").
        data_entries = [json.loads(line) for line in infile if line.strip()]

    for index, data in enumerate(data_entries):
        if "input" not in data:
            continue

        if index > 2:
            choice = random.randint(1, 4)
        else:
            choice = random.randint(1, 3)

        if choice < 3:
            # Strategies 1 and 2: create memories from the row's own input
            messages = [HumanMessage(content=data.get("input"))]
            response = knowledge_master_runnable.invoke(
                {"messages": messages, "memories": []}
            )

            run_instance = Run(input=data.get("input"))
            run_instance.update_memories(response.additional_kwargs)

            # Strategy 2: mutate one memory so we can test an update function call
            if choice > 1 and run_instance.memories:
                selected_memory = random.choice(run_instance.memories)

                # Use a dedicated name for the position inside the memory
                # list so the row counter `index` from enumerate is not
                # overwritten.
                memory_index = run_instance.memories.index(selected_memory)

                mutated_memory = mutate_memory_runnable.invoke(
                    {"memory": selected_memory}
                )

                # Replace the selected memory with the mutated memory
                run_instance.memories[memory_index] = mutated_memory.content
        elif choice == 3:
            # Strategy 3: no memory ("" is not a tool-call payload, so the
            # memory list ends up empty)
            run_instance = Run(input=data.get("input"))
            run_instance.update_memories("")
        else:
            # Strategy 4: grab and join inputs from 1 or 2 earlier entries
            other_entries = random.sample(data_entries[:index], random.randint(1, 2))
            combined_input = " ".join(
                [entry.get("input", "") for entry in other_entries]
            )
            messages = [HumanMessage(content=combined_input)]
            response = knowledge_master_runnable.invoke(
                {"messages": messages, "memories": []}
            )
            run_instance = Run(input=data.get("input"))
            run_instance.update_memories(response.additional_kwargs)

        updated_runs.append(run_instance.to_dict())

    # Rewrite the updated runs back to the file
    with open(file_name, "w") as outfile:
        for run in updated_runs:
            json.dump(run, outfile)
            outfile.write("\n")

    print(f"Updated {len(updated_runs)} runs in {file_name}")

In [13]:
# Usage
# Step 2: fill in the "memories" field of every run, rewriting the eval file in place.
generate_memories_for_eval_dataset(EVAL_FILE_PATH)

Updated 10 runs in eval_dataset.jsonl


### Step 3 - Generate expected output based on inputs and memories

In [14]:
# Loop through each row and generate an expected output with the runnable we have already defined to generate memories

import json
from langchain_core.messages import HumanMessage


def generate_expected_output_for_eval_dataset(file_name):
    """Attach the expected tool-call output to every row of a JSONL eval file.

    Each row that has an ``"input"`` key is sent, together with its stored
    ``memories`` (an empty list when absent), through
    ``knowledge_master_runnable``. The tool calls of the reply become the
    row's ``desired_response`` and ``desired_assistant_message``. Rows
    without ``"input"`` are dropped. The file is overwritten once all rows
    have been processed.

    Args:
        file_name: Path of the JSONL file to read and rewrite.
    """
    rewritten_rows = []

    with open(file_name, "r") as source:
        for raw_line in source:
            row = json.loads(raw_line.strip())
            if "input" not in row:
                continue

            user_text = row.get("input")
            known_memories = row.get("memories", [])

            # Ask the extraction chain what should happen for this input
            # given the memories that already exist.
            model_reply = knowledge_master_runnable.invoke(
                {
                    "messages": [HumanMessage(content=user_text)],
                    "memories": known_memories,
                }
            )

            # Rebuild the row through Run so the output has the standard shape
            run_instance = Run(input=user_text, memories=known_memories)
            run_instance.update_desired_response(model_reply.additional_kwargs)
            rewritten_rows.append(run_instance.to_dict())

    # Overwrite the source file, one JSON object per line
    with open(file_name, "w") as sink:
        for rewritten in rewritten_rows:
            json.dump(rewritten, sink)
            sink.write("\n")

    print(f"Updated {len(rewritten_rows)} runs in {file_name}")


# Usage
# Step 3: add the expected output to every run, rewriting the eval file in place.
generate_expected_output_for_eval_dataset(EVAL_FILE_PATH)

Updated 10 runs in eval_dataset.jsonl


### Step 4 - Generate the bad output based on our expected outputs

In [15]:
# Create a chain to subtly mutate our desired output

from langchain_openai.chat_models import ChatOpenAI
from langchain.prompts import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
)

# System prompt that asks the model to subtly change the meaning of a
# sentence by altering a single word. `{desired_response}` is the only
# template variable.
SYSTEM_TEMPLATE_MUTATE_RESPONSE = """
Your job is to mutate a string of text to subtly change the meaning of it by only changing or modifying a single word.

This may take a few possible paths:
- You might make the meaning the opposite of the original meaning
- You might change the subject to be something different
- You might modify the verb to make it stronger, weaker, or completely different
- You might change the complement to be something different

The end sentence should still make sense, it should just be a different sentence with different original meaning than the original because of the word you changed.

Here is the sentence to modify: 

```
{desired_response}
```

Now return a string that contains the modified sentence and nothing else.
"""


# Get the prompt to use - you can modify this!
# The chat prompt consists of the single system message defined above.
prompt = ChatPromptTemplate.from_messages(
    [
        SystemMessagePromptTemplate.from_template(SYSTEM_TEMPLATE_MUTATE_RESPONSE),
    ]
)

# Choose the LLM that will drive the agent
llm = ChatOpenAI(
    model="gpt-3.5-turbo-0125",
    temperature=0.4,
)

# Invoked with {"desired_response": <text>}; callers read the mutated text from `.content`.
mutation_runnable = prompt | llm

In [16]:
# Iterate through each row and generate a bad output

from copy import deepcopy


def _corrupt_tool_call(call_args):
    """Make one tool call's argument dict wrong, in place.

    Draws a number from 1 to 4: 1 swaps the category for a different one,
    2 swaps the action for a different one, and 3 or 4 replace the knowledge
    text with a mutated sentence - so knowledge mutation is twice as likely
    as either of the other two corruptions.
    """
    roll = random.randint(1, 4)

    if roll == 1:
        # Pick any category other than the current one
        present_category = Category(call_args.get("category", ""))
        other_categories = [c for c in Category if c != present_category]
        call_args["category"] = random.choice(other_categories).value
        return

    if roll == 2:
        # Pick any action other than the current one
        present_action = Action(call_args.get("action", ""))
        other_actions = [a for a in Action if a != present_action]
        call_args["action"] = random.choice(other_actions).value
        return

    # Let the mutation chain rewrite the knowledge text
    mutated = mutation_runnable.invoke(
        {"desired_response": call_args.get("knowledge", "")}
    )
    call_args["knowledge"] = mutated.content


def generate_bad_output_for_eval_dataset(file_name):
    """Add a ``bad_response`` to every row of a JSONL eval file.

    For each row that has a ``"desired_response"`` key, a deep copy of that
    value is taken and every element of the copy is corrupted in one of three
    ways (different category, different action, or mutated knowledge text).
    The result is stored under ``"bad_response"``; the original
    ``desired_response`` is left untouched. Rows without
    ``"desired_response"`` are dropped. The file is overwritten once all rows
    have been processed.

    Args:
        file_name: Path of the JSONL file to read and rewrite.
    """
    rewritten_rows = []

    with open(file_name, "r") as source:
        for raw_line in source:
            row = json.loads(raw_line.strip())
            if "desired_response" not in row:
                continue

            # Work on a copy so the expected output stays intact
            corrupted_calls = deepcopy(row["desired_response"])
            for call_args in corrupted_calls:
                _corrupt_tool_call(call_args)

            row["bad_response"] = corrupted_calls
            rewritten_rows.append(row)

    # Overwrite the source file, one JSON object per line
    with open(file_name, "w") as sink:
        for rewritten in rewritten_rows:
            json.dump(rewritten, sink)
            sink.write("\n")

    print(f"Updated {len(rewritten_rows)} bad responses in {file_name}")


# Usage
# Step 4: add a "bad_response" to every run, rewriting the eval file in place.
generate_bad_output_for_eval_dataset(EVAL_FILE_PATH)

Updated 10 bad responses in eval_dataset.jsonl


### Step 5 - Generate Fine-tuning data

In [17]:
import json


def generate_data_for_finetuning(input_file_name, output_file_name):
    """Convert a JSONL eval file into JSONL fine-tuning records.

    Every input line becomes one output record with two keys:

    - ``messages``: a user message built from the row's ``input`` (``""``
      when absent) followed by the row's ``desired_assistant_message``
      (``""`` when absent).
    - ``functions``: a hard-coded definition of the ``Knowledge_Modifier``
      function, identical for every record.

    Args:
        input_file_name: Path of the JSONL eval file to read.
        output_file_name: Path of the JSONL file to create or overwrite.
    """
    # The function definition does not depend on the row, so build it once.
    category_schema = {
        "description": "Category that this knowledge belongs to",
        "allOf": [
            {
                "title": "Category",
                "description": "An enumeration.",
                "enum": [
                    "Allergy",
                    "Like",
                    "Dislike",
                    "Attribute",
                ],
                "type": "string",
            }
        ],
    }
    action_schema = {
        "description": "Whether this knowledge is adding a new record, updating a record, or deleting a record",
        "allOf": [
            {
                "title": "Action",
                "description": "An enumeration.",
                "enum": ["Create", "Update", "Delete"],
                "type": "string",
            }
        ],
    }
    function_definitions = [
        {
            "name": "Knowledge_Modifier",
            "description": "Knowledge_Modifier(knowledge: str, category: str, action: str, knowledge_old: str = '') -> dict - Add, update, or delete a bit of knowledge",
            "parameters": {
                "type": "object",
                "properties": {
                    "knowledge": {
                        "description": "Condensed bit of knowledge to be saved for future reference in the format [person] [fact] (e.g. Husband doesn't like tuna, I am allergic to shellfish, etc)",
                        "type": "string",
                    },
                    "knowledge_old": {
                        "description": "If updating or deleting record, the complete, exact phrase that needs to be modified",
                        "type": "string",
                    },
                    "category": category_schema,
                    "action": action_schema,
                },
                "required": ["knowledge", "category", "action"],
            },
        }
    ]

    with open(input_file_name, "r") as infile, open(output_file_name, "w") as outfile:
        for raw_line in infile:
            row = json.loads(raw_line.strip())

            user_message = {"role": "user", "content": row.get("input", "")}
            assistant_message = row.get("desired_assistant_message", "")

            record = {
                "messages": [user_message, assistant_message],
                "functions": function_definitions,
            }

            # One JSON object per line
            outfile.write(json.dumps(record) + "\n")


# Usage
# Step 5: convert the finished eval file into the fine-tuning file.
generate_data_for_finetuning(EVAL_FILE_PATH, FINETUNE_FILE_PATH)