# DROP Task

English reading comprehension benchmark requiring Discrete Reasoning Over the content of Paragraphs.


In [None]:
from zeno_client import ZenoClient, ZenoMetric
import datasets
import os
import dotenv

# Load variables from a .env file into the process environment; the Zeno
# client below reads ZENO_API_KEY from os.environ.
# override=True is passed so that (per python-dotenv's documented behaviour)
# values from the .env file replace variables that are already set.
dotenv.load_dotenv(override=True)

In [None]:
# Create the Zeno client from the API key stored in the environment.
# Indexing os.environ directly raises KeyError if ZENO_API_KEY is not set.
client = ZenoClient(os.environ["ZENO_API_KEY"])

Select the models you want to analyze. Each entry must be the name of the model's details repository, as shown when clicking the 📄 icon next to a model on the leaderboard, *without* the leading `details_` prefix: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard

In [None]:
# Models to compare, chosen to cover a spread of DROP results (author's notes):
#   - underperforming: falcon-180B, llama2-70B, mistral-7B
#   - performing well: Yi-34B, tigerbot-70B, possibly internlm-20B
#   - in the middle:   facebook/xglm-7.5B
# NOTE: llama2-70B and internlm-20B are named above but are not part of the
# list below.
models = [
    "01-ai__Yi-34B_public",
    "TigerResearch__tigerbot-70b-chat",
    "tiiuae__falcon-180B",
    "mistralai__Mistral-7B-v0.1",
    "facebook__xglm-7.5B",
]

We load the base DROP dataset to get the ground-truth (gold) answers. We also do some processing on the inputs.

In [None]:
# Fetch the DROP dataset and keep its validation split as a pandas DataFrame.
drop_dataset = datasets.load_dataset("drop")
base_df = drop_dataset["validation"].to_pandas()

In [None]:
# Keep a single row per question id.
base_df = base_df.drop_duplicates(subset=["query_id"])

# Markdown text combining the passage and the question; used as the data
# column when the dataset is uploaded.
base_df["input"] = base_df.apply(
    lambda row: f"**Passage**: {row['passage']} \n\n**Question:** {row['question']}",
    axis=1,
)

# All answer spans of a row joined into one comma-separated label string.
base_df["answers"] = base_df.apply(
    lambda row: ", ".join(row["answers_spans"]["spans"]),
    axis=1,
)

# Type of the first answer span (raises IndexError if a row has no types).
base_df["answer type"] = base_df["answers_spans"].apply(
    lambda spans: spans["types"][0]
)

# Character counts of the passage and the question, for slicing in Zeno.
for text_column in ("passage", "question"):
    base_df[f"{text_column} length"] = base_df[text_column].str.len()

In [None]:
# Rendering configuration passed to Zeno: markdown for the data column,
# plain text for the label and the model output.
project_view = {
    "data": {"type": "markdown"},
    "label": {"type": "text"},
    "output": {"type": "text"},
}

# Metrics are means over the per-example columns uploaded with each system.
project_metrics = [
    ZenoMetric(name="f1", type="mean", columns=["f1"]),
    ZenoMetric(name="em", type="mean", columns=["em"]),
    ZenoMetric(name="avg output length", type="mean", columns=["output length"]),
]

proj = client.create_project(
    name="OpenLLM Leaderboard DROP Comparison",
    view=project_view,
    description="Exploring performance differences on DROP for models in OpenLLM Leaderboard.",
    metrics=project_metrics,
)

In [None]:
# Columns sent to Zeno: the id, the formatted input, the gold answers and the
# derived metadata columns.
dataset_columns = [
    "query_id",
    "input",
    "answers",
    "passage length",
    "question length",
    "answer type",
]
proj.upload_dataset(
    base_df[dataset_columns],
    id_column="query_id",
    data_column="input",
    label_column="answers",
)

In [None]:
# Load the per-example DROP details of a single model (the second entry of
# `models`). NOTE(review): this looks like an exploratory load; confirm it is
# still needed.
details_repo = f"open-llm-leaderboard/details_{models[1]}"
output_df = datasets.load_dataset(details_repo, "harness_drop_3")["latest"].to_pandas()

In [None]:
# For every selected model: load its per-example DROP details, join them with
# the base dataset, derive the output/metric columns and upload the result to
# the Zeno project as one system.
for m in models:
    print("uploading ", m)
    output_df = datasets.load_dataset("open-llm-leaderboard/details_" + m, "harness_drop_3")["latest"].to_pandas()

    # Inner join on the question id; rows without a match in base_df are
    # dropped, and only one row per query_id is kept.
    merged_df = output_df.merge(base_df, left_on="example", right_on="query_id").drop_duplicates(subset=['query_id'])

    # Use the first prediction as the model output, or '' when there is none.
    # len() is used instead of truthiness because the cell may be an
    # array-like whose truth value is ambiguous (assumption, to be confirmed).
    merged_df['output'] = merged_df['predictions'].apply(lambda x: x[0] if len(x) > 0 else '')

    if "metrics" in merged_df.columns:
        # Scores are nested in a per-row mapping: flatten them into columns.
        merged_df["f1"] = merged_df['metrics'].apply(lambda x: x['f1'])
        merged_df["em"] = merged_df['metrics'].apply(lambda x: x['em'])
    else:
        # Scores must already be flat columns. The original re-assigned them
        # to themselves (a no-op); instead, fail with an explicit message if
        # they are absent.
        missing = [col for col in ("f1", "em") if col not in merged_df.columns]
        if missing:
            raise KeyError(
                f"{m}: details have neither a 'metrics' column nor the column(s) {missing}"
            )

    merged_df['output length'] = merged_df['output'].str.len()
    proj.upload_system(merged_df[["query_id", "output", "f1", "em", "output length"]], name=m, id_column="query_id", output_column="output")