## Building an SQUAD2 Eval

This notebook shows how to:
- Build and run an eval
- Load the results and into a Pandas Dataframe

We use the `evals.elsuite.basic.match:Match` Eval class here to check whether new completions match the correct answer. Under the hood, it will generate a completion with the choice of model for each prompt, check if the completion matches the true answer, then logs a result.

In [79]:
import yaml
import os

data_pth = "squad2_extract"

In [80]:
# Install, and download SQUAD if you haven't already
import json
%pip install -e .

!pip install -q wget

import wget

def download_squad(version='2.0', target_dir='.'):
    if version not in ['1.1', '2.0']:
        raise ValueError("SQuAD version must be '1.1' or '2.0'")

    squad_url = {
        '1.1': {
            'train': 'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json',
            'dev': 'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json'
        },
        '2.0': {
            'train': 'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json',
            'dev': 'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json',
        }
    }

    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    for split, url in squad_url[version].items():
        filename = f'squad_{version.replace(".", "_")}_{split}.json'
        file_path = os.path.join(target_dir, filename)
        if not os.path.exists(file_path):
            print(f'Downloading {filename} to {target_dir}...')
            wget.download(url, file_path)
            print(f'{filename} downloaded successfully to {target_dir}.')
        else:
            print(f'{filename} already exists in {target_dir}.')

download_squad('2.0', target_dir=data_pth)


Obtaining file:///Users/jackhopkins/PycharmProjects/evals/examples
[31mERROR: file:///Users/jackhopkins/PycharmProjects/evals/examples does not appear to be a Python project: neither 'setup.py' nor 'pyproject.toml' found.[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.
Downloading squad_2_0_train.json to squad2_shuffled...
squad_2_0_train.json downloaded successfully to squad2_shuffled.
Downloading squad_2_0_dev.json to squad2_shuffled...
squad_2_0_dev.json downloaded successfully to squad2_shuffled.


In [81]:
import pandas as pd
import os

# Assuming this notebook is in examples/
registry_pth = os.path.join(os.getcwd(), "../evals/registry")

In [82]:
# Build the prompts using Chat format. We support converting Chat conversations to text for non-Chat models

sys_msg = "The following are questions about {}. Extract the exact answer if it exists, else write 'None'."
def create_chat_prompt(sys_msg, question, paragraph, subject):
    user_prompt = f"Paragraph:{paragraph}\nQuestion:{question}\nAnswer:"
    return [
        {"role": "system", "content": sys_msg.format(subject)}, 
        {"role": "user", "content": user_prompt}
    ]

def create_chat_example(question, paragraph, correct_answer):
    """
    Form few-shot prompts in the recommended format: https://github.com/openai/openai-python/blob/main/chatml.md#few-shot-prompting
    """
    user_prompt = f"Paragraph:{paragraph}\nQuestion:{question}\nAnswer:"
    return [
        {"role": "system", "content": user_prompt, "name": "example_user"},
        {"role": "system", "content": correct_answer, "name": "example_assistant"},
    ]

In [83]:
import json
import os

def squad_json_to_dataframe(file_path):
    with open(file_path, 'r') as file:
        squad_data = json.load(file)

    data = []

    for article in squad_data['data']:
        title = article['title']
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qas in paragraph['qas']:
                question = qas['question']
                if qas['is_impossible']:
                    answer = "None"
                else:
                    answer = qas['answers'][0]['text']
                data.append({'Question': question, 'Paragraph': context, 'Answer': answer, 'Subject': title})

    df = pd.DataFrame(data, columns=['Question', 'Paragraph', 'Answer', 'Subject'])
    return df

subjects = sorted([f for f in os.listdir(data_pth)])

registry_yaml = {}
samples_pth, few_shot_pth = '', ''
for subject in subjects:

    subject_pth = os.path.join(registry_pth, "data", data_pth)

    os.makedirs(subject_pth, exist_ok=True)
    dataset = squad_json_to_dataframe(os.path.join(data_pth, subject))
    print(subject)
    if subject.endswith('_dev.json'):
         # Create few-shot prompts
        dataset["sample"] = dataset.apply(lambda x: create_chat_example(x["Question"], x["Paragraph"], x['Answer']), axis=1)
        few_shot_pth = os.path.join(subject_pth, "few_shot.jsonl")
        dataset[["sample"]].sample(frac=1).head(400).to_json(few_shot_pth, lines=True, orient="records")
        eval_id = f"match_{data_pth}_dev"

    else:
        # Create test prompts and ideal completions
        dataset["input"] = dataset.apply(lambda x: create_chat_prompt(sys_msg, x["Question"], x['Paragraph'], x['Subject']), axis=1)
        dataset["ideal"] = dataset.Answer
        samples_pth = os.path.join(subject_pth, "samples.jsonl")
        dataset[["input", "ideal"]].sample(frac=1).head(400).to_json(samples_pth, lines=True, orient="records")
        eval_id = f"match_{data_pth}_train"

    registry_yaml[eval_id] = {
        "id": f"{eval_id}.test.v1",
        "metrics": ["accuracy"]
    }
    registry_yaml[f"{eval_id}.test.v1"] = {
        "class": "evals.elsuite.basic.match:Match",
        "args": {
            "samples_jsonl": samples_pth,
            "few_shot_jsonl": few_shot_pth,
            "num_few_shot": 4,
        }
    }

with open(os.path.join(registry_pth, "evals", f"{data_pth}.yaml"), "w") as f:
    yaml.dump(registry_yaml, f)

squad_2_0_dev.json
squad_2_0_train.json


In [84]:
# This will generate a JSONL which will record samples and logs and store it in /tmp/evallogs
!oaieval gpt-3.5-turbo match_squad2

Traceback (most recent call last):
  File "/Users/jackhopkins/miniforge3/bin/oaieval", line 5, in <module>
    from evals.cli.oaieval import main
  File "/Users/jackhopkins/PycharmProjects/evals/evals/__init__.py", line 1, in <module>
    from .api import check_sampled_text, completion_query, sample_freeform
  File "/Users/jackhopkins/PycharmProjects/evals/evals/api.py", line 18, in <module>
    from evals.utils.api_utils import (
  File "/Users/jackhopkins/PycharmProjects/evals/evals/utils/api_utils.py", line 17, in <module>
    openai.error.Timeout,
AttributeError: module 'openai.error' has no attribute 'Timeout'


In [85]:
# How to process the log events generated by oaieval
events = "/tmp/evallogs/{log_name}"

with open(events, "r") as f:
    events_df = pd.read_json(f, lines=True)

matches_df = events_df[events_df.type == "match"].reset_index(drop=True)
matches_df = matches_df.join(pd.json_normalize(matches_df.data))
matches_df.correct.value_counts().plot.bar(title="Correctness of generated answers", xlabel="Correctness", ylabel="Count")

FileNotFoundError: [Errno 2] No such file or directory: '/tmp/evallogs/{log_name}'

In [None]:
# Inspect samples
for i, r in pd.json_normalize(events_df[events_df.type == "sampling"].data).iterrows():
    print(f"Prompt: {r.prompt}")
    print(f"Sampled: {r.sampled}")
    print("-" * 25)