In [None]:
from zeno_client import ZenoClient, ZenoMetric
import pandas as pd
import os
import dotenv

# Load environment variables from ../.env (ZENO_API_KEY is read from the
# environment further down). override=True lets values from the file replace
# variables that are already set in the process environment.
dotenv.load_dotenv("../.env", override=True)

In [None]:
# Root directory of the GSM8k run outputs (relative path). Each entry in it is
# used as a model name and is read from <OUTPUT_DIR>/<model>/output.jsonl.
OUTPUT_DIR = "../../outputs/gsm8k"

In [None]:
# One sub-directory per model. Keep only entries that actually contain an
# output.jsonl (skips stray files such as .DS_Store that would make
# pd.read_json fail later) and sort them, because os.listdir returns entries
# in arbitrary order and models[0] is used as the source of the base dataset.
models = sorted(
    name
    for name in os.listdir(OUTPUT_DIR)
    if os.path.isfile(os.path.join(OUTPUT_DIR, name, "output.jsonl"))
)

In [None]:
# Build the base dataset (question + reference answer) from the first model's
# output file; it is uploaded to the project in a later cell.
df = pd.read_json(os.path.join(OUTPUT_DIR, models[0], "output.jsonl"), lines=True)

# Keep only the text that precedes the first end-of-text marker.
reference_answers = df["answer_text"].map(
    lambda text: text.partition("<|endoftext|>")[0]
)
base_df = pd.DataFrame(
    {"qid": df["qid"], "question": df["question"], "answer": reference_answers}
)

In [None]:
# Index os.environ directly so a missing ZENO_API_KEY fails here with a clear
# KeyError instead of silently handing None to the client.
zeno_client = ZenoClient(os.environ["ZENO_API_KEY"])

In [None]:
# Create the Zeno project. The data, label and output fields are all rendered
# as markdown; the single metric is the mean of the "is_correct" column.
accuracy_metric = ZenoMetric(name="Accuracy", type="mean", columns=["is_correct"])
project = zeno_client.create_project(
    name="Gemini Evaluation - GSM8k",
    description="Evaluation of Gemini, GPT-4, and Mixtral on GSM8k dataset",
    view={field: {"type": "markdown"} for field in ("data", "label", "output")},
    public=True,
    metrics=[accuracy_metric],
)

In [None]:
# Upload the shared questions and reference answers; rows are keyed by "qid".
project.upload_dataset(
    base_df,
    id_column="qid",
    data_column="question",
    label_column="answer",
)

In [None]:
def _render_output(row):
    """Return the generated text followed by the predicted answer in bold markdown."""
    return f"{row['generated_text']}\n\n**{row['predict']}**"


# Upload every model's results as a separate system in the project.
for model in models:
    output_path = os.path.join(OUTPUT_DIR, model, "output.jsonl")
    df = pd.read_json(output_path, lines=True)
    # NOTE(review): astype(bool) turns NaN into True — confirm that
    # "is_correct" never contains missing values.
    output_df = pd.DataFrame(
        {
            "qid": df["qid"],
            "output": df.apply(_render_output, axis=1),
            "is_correct": df["is_correct"].astype(bool),
        }
    )
    project.upload_system(
        output_df, name=model, id_column="qid", output_column="output"
    )