In [1]:
# Load from parent directory if not installed
# NOTE: `import importlib` alone does not guarantee that the `importlib.util`
# submodule is loaded, so it is imported explicitly here.
import importlib.util
import os
import sys

# find_spec returns None when the package cannot be located on sys.path.
if importlib.util.find_spec("sammo") is None:
    sys.path.append("../")

# Read back further down via os.getenv("CACHE_FILE", ...) when the runner is built.
os.environ["CACHE_FILE"] = "cache/working_with_data.tsv"

# Basic Prompt Engineering

`SAMMO` has a variety of tools that make trying out different sets of prompts easy. Let's start by loading the same task as before.

In [8]:
# %load -r :27 _init.py
import pathlib
import sammo
from sammo.runners import OpenAIChat
from sammo.base import Template, EvaluationScore
from sammo.components import Output, GenerateText, ForEach, Union
from sammo.extractors import ExtractRegex
from sammo.data import DataTable
import json
import requests
import os

# Credentials are looked up in <parent of the working directory>/config/personal.openai.
API_CONFIG_FILE = pathlib.Path.cwd().parent.joinpath("config", "personal.openai")
# Use the file when it exists; otherwise leave the setting empty so the check below fires.
API_CONFIG = API_CONFIG_FILE if API_CONFIG_FILE.exists() else ""
if not API_CONFIG:
    raise ValueError('Please set API_CONFIG to {"api_key": "YOUR_KEY"}')

# Only warnings are of interest for now; the return value is discarded.
_ = sammo.setup_logger("WARNING")

# The cache location comes from the CACHE_FILE environment variable,
# falling back to "cache.tsv" when it is unset.
runner = OpenAIChat(
    model_id="gpt-3.5-turbo-16k",
    api_config=API_CONFIG,
    cache=os.getenv("CACHE_FILE", "cache.tsv"),
    timeout=30,
)

In [3]:
import json
import requests


def load_data(
    url="https://github.com/google/BIG-bench/raw/main/bigbench/benchmark_tasks/implicatures/task.json",
    timeout=30,
):
    """Download a task definition and wrap its examples in a ``DataTable``.

    The JSON document is expected to contain an ``examples`` list, where each
    example has a ``target_scores`` mapping, and a ``task_prefix`` entry.
    Each example gets an extra ``output`` key holding the label with the
    highest target score.

    Args:
        url: Location of the task JSON file.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, a stalled connection would block the notebook forever.

    Returns:
        The result of ``DataTable.from_records`` over the examples, with
        ``"input"`` as input field and the task prefix passed as the
        ``instructions`` constant.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    response.raise_for_status()
    task = json.loads(response.content)
    # convert label to single string
    for x in task["examples"]:
        x["output"] = max(x["target_scores"], key=x["target_scores"].get)

    return DataTable.from_records(
        task["examples"],
        input_fields="input",
        constants={"instructions": task["task_prefix"]},
    )


# Fetch the task with the default URL; a later cell draws its evaluation sample from `mydata`.
mydata = load_data()

Let's say we want to try out different instructions. For that, let's define an objective.

In [4]:
# %load -s accuracy _init.py
def accuracy(y_true: DataTable, y_pred: DataTable) -> EvaluationScore:
    """Score predictions by the share of outputs that equal the reference outputs.

    Args:
        y_true: Table whose ``outputs.values`` hold the reference labels.
        y_pred: Table whose ``outputs.values`` hold the predicted labels.

    Returns:
        An ``EvaluationScore`` wrapping the number of matching positions
        divided by the number of reference labels.
    """
    gold_labels = y_true.outputs.values
    predicted_labels = y_pred.outputs.values

    # Compare position by position and tally the matches.
    hits = 0
    for predicted, gold in zip(predicted_labels, gold_labels):
        hits += predicted == gold

    return EvaluationScore(hits / len(gold_labels))


Nothing special here - we simply compute the fraction of correct labels and return it as an `EvaluationScore` object.

To try out different prompts, we need to describe the space of possible candidates. `SAMMO` does that by offering a number of operators, such as `one_of`.

In [5]:
from sammo.search import EnumerativeSearch
from sammo.search_op import one_of
from sammo.base import Template
from sammo.components import Output, GenerateText


def labeling_prompt_space():
    """Describe the candidate prompts: one labeling template, two alternative instructions.

    Returns:
        An ``Output`` wrapping a ``GenerateText`` component. The template's
        ``instructions`` slot is filled by a ``one_of`` choice named ``"instr"``.
    """
    instruction_candidates = [
        "Does Speakers 2 answer mean yes or no to Speaker 1?",
        "Think about whether Speaker 2 should start with 'yes' or 'no'?",
    ]
    instruction_choice = one_of(instruction_candidates, name="instr")

    # The remaining slot, {{{input}}}, is left in the template and not bound here.
    template = Template(
        "Instructions:{{{instructions}}}\nOutput labels: yes, no\nInput: {{{input}}}\nOutput:",
        instructions=instruction_choice,
    )
    return Output(GenerateText(template))

With the search space defined, we can now kick off the search:

In [6]:
# Draw 10 examples with a fixed seed; the same `sample` is reused by the later search cell.
sample = mydata.sample(10, seed=42)
# Search over the candidates described by labeling_prompt_space, scoring each with accuracy.
searcher = EnumerativeSearch(runner, labeling_prompt_space, accuracy)
searcher.fit(sample)
# Print the fitting log (one row per candidate with objective, costs and parse errors).
searcher.show_report()

candidate[###################################]2/2[00:00<00:00] >> minibatches (total)[#######################]20/20[00:00<00:00]

Fitting log:
iteration    action                                             objective    costs                         parse_errors
-----------  -------------------------------------------------  -----------  ----------------------------  --------------
0            {'instr': "'Does Speakers 2 answer mean yes or no  0.6          {'input': 596, 'output': 10}  0.0
             to Speaker 1?'"}
1            {'instr': "'Think about whether Speaker 2 should   0.4          {'input': 626, 'output': 10}  0.0
             start with 'yes' or 'no'?'"}


Okay, we are doing a bit better! Let's see if changing the temperature (the `randomness` argument of `GenerateText`) would impact the result.

In [None]:
def labeling_prompt_space():
    """Describe the candidate prompts: two instructions combined with two randomness values.

    This redefinition replaces the earlier function of the same name and adds
    a second search dimension.

    Returns:
        An ``Output`` wrapping a ``GenerateText`` component. The template's
        ``instructions`` slot is a ``one_of`` choice named ``"instr"``, and the
        ``randomness`` argument is a ``one_of`` choice named ``"randomness"``.
    """
    instruction_candidates = [
        "Does Speakers 2 answer mean yes or no to Speaker 1?",
        "Think about whether Speaker 2 should start with 'yes' or 'no'?",
    ]
    instruction_choice = one_of(instruction_candidates, name="instr")

    # The remaining slot, {{{input}}}, is left in the template and not bound here.
    template = Template(
        "Instructions:{{{instructions}}}\nOutput labels: yes, no\nInput: {{{input}}}\nOutput:",
        instructions=instruction_choice,
    )
    randomness_choice = one_of([0.7, 1.0], name="randomness")

    return Output(GenerateText(template, randomness=randomness_choice))


# Build a fresh searcher so it picks up the labeling_prompt_space defined just above
# (instructions x randomness), and fit it on the same `sample` as the previous search.
searcher = EnumerativeSearch(runner, labeling_prompt_space, accuracy)
searcher.fit(sample)
searcher.show_report()

candidate[                                   ]0/4[00:00<??:??] >> minibatches (total)[                        ]0/10[00:00<??:??]

Not bad! With `SAMMO`, we can quickly try out several alternatives if we want to manually tinker with different prompts.

However, `SAMMO` offers a much more powerful way of automatically optimizing prompts, which we cover in the section after the next one.