In [None]:
from zeno_client import ZenoClient, ZenoMetric
import pandas as pd
import json
import os
from dotenv import load_dotenv

# Read key/value pairs from a local .env file into the process environment.
# override=True asks python-dotenv to let values from the file replace variables
# that are already set, so the file is the source of truth for this notebook.
load_dotenv(override=True)

In [None]:
# Load the raw evaluation records. The encoding is pinned to UTF-8 (the standard
# JSON interchange encoding) so non-ASCII text decodes the same way on every
# platform instead of depending on the locale's default codec.
with open("document-qa-results.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# One row per record: the record's "data" field becomes the "question" column,
# and the positional index doubles as the row id used by later uploads.
data_df = pd.DataFrame({"question": [d["data"] for d in data]})
data_df["id"] = data_df.index

In [None]:
# Build the Zeno client from the API key held in the ZENO_API_KEY environment
# variable. Indexing os.environ directly means a missing variable fails right
# here with a KeyError rather than later during an upload.
client = ZenoClient(os.environ["ZENO_API_KEY"])

In [None]:
# View for a single retrieved passage: score, a markdown reference and the
# passage text stacked vertically, in that order.
retrieved_passage_view = {
    "type": "vstack",
    "keys": {
        "score": {"type": "text", "label": "score: "},
        "reference": {"type": "markdown"},
        "text": {"type": "text", "label": "text: "},
    },
}

# View for a system output: the answer followed by the list of retrieved
# passages, each rendered with the passage view above.
system_output_view = {
    "type": "vstack",
    "keys": {
        "answer": {"type": "text"},
        "retrieved": {
            "type": "list",
            "elements": retrieved_passage_view,
            "collapsible": "bottom",
            "border": True,
            "pad": True,
        },
    },
}

# Create the project. Every metric is a "mean" over the column that carries the
# same name as the metric, so the metric list is generated from the column names.
project = client.create_project(
    name="Document QA",
    view={
        "data": {"type": "text"},
        "label": {"type": "text"},
        "output": system_output_view,
    },
    description="Document-grounded question answering with Wikipedia",
    metrics=[
        ZenoMetric(name=column, type="mean", columns=[column])
        for column in (
            "accuracy",
            "exact_match",
            "substring_match",
            "f1",
            "rougel",
        )
    ],
)

In [None]:
# Upload the question table to the project: the "id" column identifies each row
# and the "question" column is passed as the data column.
project.upload_dataset(data_df, id_column="id", data_column="question")

In [None]:
def _render_output(result):
    """Serialize one system result into the JSON string stored in "output".

    `result` is the first element of a record's "output" list. The JSON object
    holds the answer plus a one-element "retrieved" list describing the first
    retrieved passage, with its reference turned into a markdown link.

    NOTE(review): only retrieved[0] is included even though the project view
    renders "retrieved" as a list -- confirm that this is intended.
    """
    answer = result["answer"]
    top_passage = result["retrieved"][0]
    page_ref = top_passage["reference"]
    # String concatenation assumes the reference is already a str.
    markdown_link = "[{idx}]({url})".format(
        idx=page_ref,
        url="https://en.wikipedia.org/?curid=" + page_ref,
    )
    return json.dumps(
        {
            "answer": answer,
            "retrieved": [
                {
                    "reference": markdown_link,
                    "text": top_passage["text"],
                    "score": top_passage["score"],
                }
            ],
        }
    )


# Only the first entry of each record's "output" list is used throughout.
first_results = [d["output"][0] for d in data]

# Column order: "output" first, then one column per evaluation metric, each read
# from the result's "answer_evaluation" mapping.
output_df = pd.DataFrame(
    {
        "output": [_render_output(result) for result in first_results],
        **{
            metric: [result["answer_evaluation"][metric] for result in first_results]
            for metric in (
                "accuracy",
                "exact_match",
                "substring_match",
                "f1",
                "rougel",
            )
        },
    }
)
# The positional index serves as the row id, mirroring how data_df gets its ids.
output_df["id"] = output_df.index

In [None]:
# Upload the per-record outputs and metric columns as a system named
# "Llama-2 BM25". Rows are keyed by "id" and the rendered output is taken from
# the "output" column.
# NOTE(review): presumably these ids must line up with the ids uploaded in the
# dataset cell -- both are positional indices over the same `data` list.
project.upload_system(
    output_df, name="Llama-2 BM25", id_column="id", output_column="output"
)