In [10]:
# Install the packages this notebook relies on into the running kernel's environment.
# NOTE(review): click, anthropic, pymemgpt, openai, pytest, tqdm, browser-cookie3 and
# litellm carry no version constraint here, so a fresh install may resolve to different
# releases than the ones this notebook was originally run with.
%pip install click anthropic pymemgpt pystache~=0.6.5 openai faker~=20.1.0 colorama~=0.4.6 \
pytest time-machine==2.13.0 tiktoken~=0.5.2 goodai-ltm==0.4.4 gdown==4.7.1 tqdm \
pyyaml~=6.0.1 jinja2~=3.1.2 Werkzeug==2.2.2 flask==2.2.3 browser-cookie3 \
langchain==0.1.1 langchain-openai==0.0.2 rouge_score==0.1.2 litellm zstd==1.5.5.1 \
humanize==4.9.0 matplotlib==3.8.4 google-generativeai==0.4.1

Note: you may need to restart the kernel to use updated packages.


In [14]:
import sys, os
sys.path.insert(0, "goodai-ltm-benchmark")

from dataclasses import dataclass
from typing import List, Tuple
import pystache
from dataset_interfaces.interface import DatasetInterface, TestExample

# Pool of colour names from which each "favourite colour" statement is drawn.
COLOURS = [
    "Red", "White", "Blue", "Green", "Yellow",
    "Pink", "Magenta", "Cyan", "Purple", "Tan",
    "Khaki", "Beige", "Cream", "Brown", "Olive",
    # "None",  # currently disabled, so the "no favourite colour" statement is never produced
]

# Mustache templates for announcing a favourite colour; the {{colour}} tag is
# filled in with one entry of COLOURS when a statement is rendered.
STATEMENTS = [
    "My favourite colour is {{colour}}.",
    "{{colour}} is my favourite colour.",
    "My favourite colour could be described as {{colour}}.",
    "The name of my favourite colour is {{colour}}.",
]


@dataclass
class ColourDataset(DatasetInterface):
    """Dataset that states a favourite colour several times, then asks for it.

    Every generated example is a script made of ``colour_changes`` statements,
    each naming a randomly chosen colour, followed by ``question``. Only the
    colour named in the last statement is the expected answer.
    """

    name: str = "Colours"
    description: str = "Tell the agent what your favourite colour is multiple time, then ask it what that colour is."
    question: str = "What is my favourite colour?"
    # Number of colour statements issued before the question. Must be >= 1,
    # because the last stated colour is used as the expected answer.
    colour_changes: int = 3
    # Not read anywhere in this class; presumably consumed by DatasetInterface
    # or the benchmark runner -- TODO confirm.
    memory_span: int = 100

    def generate_examples(self, num_examples):
        """Build ``num_examples`` test examples.

        Args:
            num_examples: How many independent examples to generate.

        Returns:
            A list of ``TestExample`` objects. Each one carries the script
            (colour statements followed by the question), a parallel
            ``is_question`` list that is True only for the final entry, and a
            one-element ``expected_responses`` list holding the last colour.
        """
        # One renderer is enough for every statement; no need to rebuild it
        # for each example.
        renderer = pystache.Renderer()
        examples = []

        for _ in range(num_examples):
            colours = []
            script = []

            for _ in range(self.colour_changes):
                # Draw the colour first and the template second so the random
                # stream is consumed in the same order on every run.
                colour = self.random.choice(COLOURS)
                if colour == "None":
                    name_stmt = "I have no favourite colour."
                else:
                    name_stmt = renderer.render(self.random.choice(STATEMENTS), {"colour": colour})
                colours.append(colour)
                script.append(name_stmt)

            script.append(self.question)
            # Every statement is a non-question; only the final line is one.
            is_question = [False] * len(colours) + [True]

            examples.append(
                TestExample(
                    dataset_generator=self,
                    script=script,
                    expected_responses=[colours[-1]],
                    is_question=is_question,
                )
            )

        return examples

    def evaluate_correct(
        self, questions: List[str], responses: List[str], expected_answers: List[str]
    ) -> Tuple[int, int, List[str]]:
        """Score the final response against the expected colour.

        The expected colour must appear in the last response as a whole word
        (case-insensitive). A plain substring test would wrongly accept e.g.
        "red" inside "preferred" or "tan" inside "understand".

        Args:
            questions: The questions asked; not used by this check.
            responses: The agent's responses; only the last one is examined.
            expected_answers: Expected answers; the first entry is the colour.

        Returns:
            ``(score, max_score, reasons)`` where ``score`` is 1 on a match and
            0 otherwise, ``max_score`` is always 1, and ``reasons`` holds a
            single explanatory message.
        """
        import re  # local import so the class does not depend on a new file-level import

        color = expected_answers[0].lower()
        # An empty response list cannot contain the colour; treat it as a miss
        # instead of raising IndexError.
        final_response = responses[-1].lower() if responses else ""
        if re.search(rf"\b{re.escape(color)}\b", final_response):
            return 1, 1, [f'"{color}" is in the response.']
        return 0, 1, [f'"{color}" is NOT in the response.']


In [15]:
# Initialize the dataset
dataset = ColourDataset()
examples = dataset.generate_examples(num_examples=200)  # adjust number as needed

# Collect all sentences to translate (every script line of every example, in order)
lines = [line for ex in examples for line in ex.script]

# Collect the correct answers
answers = [answer for ex in examples for answer in ex.expected_responses]

# Make sure the output directory exists; open(..., "w") does not create it.
os.makedirs("test", exist_ok=True)

# Save example sentences to a txt file, one stripped sentence per line
with open("test/colours_sentences.txt", "w", encoding="utf-8") as f:
    f.writelines(line.strip() + "\n" for line in lines)

# Save example answers to a txt file, one stripped answer per line
# (the stripped value is what gets written, matching the sentences file)
with open("test/colours_answers.txt", "w", encoding="utf-8") as f:
    f.writelines(answer.strip() + "\n" for answer in answers)

print(f"Saved {len(lines)} sentences to test/colours_sentences.txt")
print(f"Saved {len(answers)} answers to test/colours_answers.txt")

Saved 800 sentences to test/colours_sentences.txt
Saved 200 answers to test/colours_answers.txt
