## Installing SAMMO

1. In no particular order:
    - Install poetry.
    - Download sammo_prerelease.zip and unzip.
2. Navigate to the unzipped SAMMO folder in a command shell.
3. Create and activate a new virtual environment or conda environment.
4. Then install SAMMO and its dependencies:
```
poetry install
```

Find the preliminary documentation in `_build\html\index.html`. 

# Experiment 1: Single-turn responses

Let's load the data.

In [1]:
# Load from parent directory if not installed.
# The submodule is imported explicitly: a bare `import importlib` does not
# guarantee that `importlib.util` has been loaded, in which case
# `importlib.util.find_spec` raises AttributeError.
import importlib.util

if importlib.util.find_spec("sammo") is None:
    import sys

    # Make the checkout one directory up importable.
    sys.path.append("../")

In [2]:
import requests
import json
from sammo.data import DataTable
import pathlib
import sammo
import pandas as pd
from sammo.runners import OpenAIChat
from sammo.base import EvaluationResult
from sammo.search import EnumerativeSearch
from sammo.search_op import one_of
from sammo.base import Template
from sammo.components import Output, GenerateText
from sammo.extractors import ExtractRegex, LambdaExtractor, ToNum

# Credentials are looked up in ../config/personal.openai relative to the
# current working directory; API_CONFIG stays empty when that file is missing.
API_CONFIG_FILE = pathlib.Path.cwd().parent / "config" / "personal.openai"
API_CONFIG = API_CONFIG_FILE if API_CONFIG_FILE.exists() else ""
if not API_CONFIG:
    raise ValueError('Please set API_CONFIG to {"api_key": "YOUR_KEY"}')

# Only warnings are of interest for now.
_ = sammo.setup_logger("WARNING")

# Chat runner used by the cells below.
runner = OpenAIChat(
    model_id="gpt-4",
    api_config=API_CONFIG,
    timeout=30,
    cache="cache/underspecification.tsv",
)
# Each pair is (question, depends_on): an underspecified user question and the
# comma-separated aspects its answer depends on.
_QUESTIONS_AND_DEPENDENCIES = (
    (
        "Will the current political situation in my country improve?",
        "country, definition of political situation",
    ),
    (
        "Is my outfit appropriate for the event I'm attending tonight?",
        "type of event, outfit",
    ),
    (
        "Is it safe for me to eat a peanut butter cookie?",
        "allergies",
    ),
    (
        "Are these flowers I got safe?",
        "type of flower, allergies, usage",
    ),
    (
        "Am I breaking the law by waterboarding?",
        "meaning of the term waterboarding",
    ),
    (
        "I'm sick on a plane - will a passenger named Dr. Smith be able to help me?",
        "doctorate being medical",
    ),
    (
        "Is the number I'm thinking of prime?",
        "number",
    ),
    (
        "A bank offers savings accounts with 5% interest rate. If I put my money there, will the value of my holdings increase over time in terms of real dollars?",
        "country, inflation ",
    ),
    (
        "Will this pillow match my couch?",
        "style of pillow, style of couch",
    ),
)
underspecified = [
    {"question": question, "depends_on": depends_on}
    for question, depends_on in _QUESTIONS_AND_DEPENDENCIES
]
# Wrap the raw question records in a SAMMO DataTable.
data = DataTable(underspecified)
# Bare expression: displays the table as the notebook cell's output.
data

+----------------------------------------------------+----------+
| input                                              | output   |
| {'question': 'Will the current political situation | None     |
| in my country improve?', 'depends_on': 'countr...  |          |
+----------------------------------------------------+----------+
| {'question': "Is my outfit appropriate for the     | None     |
| event I'm attending tonight?", 'depends_on':       |          |
| 'type...                                           |          |
+----------------------------------------------------+----------+
| {'question': 'Is it safe for me to eat a peanut    | None     |
| butter cookie?', 'depends_on': 'allergies'}        |          |
+----------------------------------------------------+----------+
| {'question': 'Are these flowers I got safe?',      | None     |
| 'depends_on': 'type of flower, allergies, usage'}  |          |
+----------------------------------------------------+----------+
| {'questi

In [5]:
# System prompts compared in this experiment, keyed by method name.
# "baseline" uses no system prompt at all; the other two share the same opening clause.
_MANY_FACTORS_PREFIX = (
    "When faced with a question whose answer depends on a lot of factors, "
)
BASELINE_PROMPTS = dict(
    baseline=None,
    asks=_MANY_FACTORS_PREFIX + "ask for the most relevant information to answer.",
    hedges=_MANY_FACTORS_PREFIX + "generate an answer for each important case.",
)


def run_baseline(sys_prompt, verbose=True):
    """Answer every question in ``data`` with one system prompt and score the answers.

    Uses the module-level ``runner``, ``data`` and ``underspecified``.

    Args:
        sys_prompt: System prompt passed to the answering ``GenerateText``
            component (``None`` is passed through unchanged).
        verbose: When true, print the answer table and the count table.

    Returns:
        A dict with three entries:
        ``"utility"``: per question, the number returned by the counting
        prompt divided by the number of comma-separated entries in that
        question's ``depends_on`` string;
        ``"cost"``: per question, the value of ``len(x.split(' '))`` applied
        to the answer;
        ``"answer_prompt"``: the answering component, so that later cells can
        chain further prompts onto it.
    """
    # Step 1: answer the user question in a single turn.
    question_template = Template("Question: {{{input.question}}}")
    answer_single_turn = GenerateText(question_template, system_prompt=sys_prompt)
    system_responses = Output(answer_single_turn).run_on_datatable(runner, data)
    if verbose:
        print(system_responses)

    # Step 2 (utility): have the model count how many of the listed aspects
    # the response touches on. TODO: chain-of-thought.
    count = Template(
        "Count the number of aspects from this list <list>{{input.depends_on}}</list> that are mentioned/asked about in the response. <response>{{response}}</response>",
        response=answer_single_turn,
    )
    count_as_number = ToNum(
        GenerateText(count, system_prompt="Only output a single number, nothing else.")
    )
    util = Output(count_as_number).run_on_datatable(runner, data)
    if verbose:
        print(util)
    utility = util.outputs.values

    # Step 3 (cost): length of each answer in space-separated words.
    word_counter = LambdaExtractor(answer_single_turn, "lambda x: len(x.split(' '))")
    lengths = Output(word_counter).run_on_datatable(runner, data).outputs.values

    # Normalise each count by the number of aspects listed for its question.
    normalised_utility = [
        counted / len(record["depends_on"].split(","))
        for counted, record in zip(utility, underspecified)
    ]
    return {
        "utility": normalised_utility,
        "cost": lengths,
        "answer_prompt": answer_single_turn,
    }


# Run every baseline prompt once. `results` keeps the full result per method;
# `results_flat` holds one row per (method, question) for aggregation.
results = {}
results_flat = []
for method, sys_prompt in BASELINE_PROMPTS.items():
    res = run_baseline(sys_prompt)
    results[method] = res
    for u, c in zip(res["utility"], res["cost"]):
        results_flat.append({"method": method, "utility": u, "cost": c})

# Mean utility and cost per method (displayed as the cell's output).
per_method = pd.DataFrame(results_flat).groupby("method")
per_method.mean()

minibatches[###################################################################################]9/9[00:00<??:??, 0.00it/s]
+----------------------------------------------------+----------------------------------------------------+
| input                                              | output                                             |
| {'question': 'Will the current political situation | As an AI, I don't have real-time data or the       |
| in my country improve?', 'depends_on': 'countr...  | ability to predict future events. I suggest        |
|                                                    | staying ...                                        |
+----------------------------------------------------+----------------------------------------------------+
| {'question': "Is my outfit appropriate for the     | As an AI, I'm unable to see or evaluate your       |
| event I'm attending tonight?", 'depends_on':       | outfit. However, I can give you general advice.    |
| 'type...   

method,utility,cost
asks,0.796296,65.888889
baseline,0.611111,46.111111
hedges,1.240741,238.111111


## Aside: Label system responses

Note: It's actually easier to chain this with the original prompt than to feed it as a new datatable.

In [4]:
# Grading instructions used as a system prompt. The label table inside it is
# tab-separated: label, then definition.
# NOTE(review): "Reponse" in the `refuses` row looks like a typo for
# "Response"; it is kept verbatim because this text is sent to the model and
# editing it would change the prompt (and presumably any cached responses —
# confirm before fixing).
_RUBRIC_LINES = (
    "You are a human grader labeling a response from an AI system.",
    "Assign the most relevant labels from below:",
    "Label\tDefinition",
    "asks\tResponse asks one or several clarifying questions, direct or indirect.",
    "answers\tResponse contains a clear answer.",
    "hedges\tResponse talks about facets that need to be known to provide a good answer.",
    "conditions\tResponse contains answers that come with condition when each would apply.",
    "refuses\tReponse is an apology that an answer cannot be provided.",
    "",
    "Only output the labels, separated by commas. Nothing else.",
)
rubric = "".join(line + "\n" for line in _RUBRIC_LINES)

In [7]:
# Chain the grading rubric onto the baseline answering component, so the
# labels are produced in the same pipeline rather than from a new datatable.
label_generator = GenerateText(
    results["baseline"]["answer_prompt"], system_prompt=rubric
)
# Split the comma-separated label string into a list of stripped labels.
label_splitter = LambdaExtractor(
    label_generator, 'lambda x: [y.strip() for y in x.split(",")]'
)
annotated = Output(label_splitter).run_on_datatable(runner, data)
annotated.y.to_primitives()

minibatches[################################################################################]9/9[00:00<00:00, 113.92it/s]


[['answers', 'refuses'],
 ['answers', 'hedges'],
 ['answers', 'conditions'],
 ['hedges', 'conditions'],
 'answers',
 ['conditions', 'hedges'],
 'asks',
 ['answers', 'conditions'],
 ['hedges', 'refuses']]