# DROP Task

English reading comprehension benchmark requiring Discrete Reasoning Over the content of Paragraphs.


In [None]:
from zeno_client import ZenoClient, ZenoMetric
import datasets
import os
import dotenv

# Load variables from a local .env file into the process environment;
# override=True lets values from the file replace ones that are already set.
dotenv.load_dotenv(override=True)

In [None]:
# Build the Zeno client from the ZENO_API_KEY environment variable; indexing
# os.environ raises KeyError right here if the variable is not set.
client = ZenoClient(os.environ["ZENO_API_KEY"])

Select the models you want to analyze. Each entry must be the name of the model's details repository (the one that opens when you click the 📄 icon next to the model on the leaderboard) *without* the `details_` prefix: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard

In [None]:
# Leaderboard detail repositories to compare (names without the `details_` prefix).
# Notes on expected behaviour:
#   - falcon-180B, llama2-70B, mistral-7B are underperforming
#   - Yi-34B, tigerbot-70B, possibly internlm-20B have a good perf
#   - facebook/xglm-7.5B falls in the middle
# (llama2-70B and internlm-20B appear in the notes but are not listed below.)
# NOTE: a later cell picks Falcon with models[2]; keep that entry at index 2.
models = [
    "01-ai__Yi-34B_public",
    "TigerResearch__tigerbot-70b-chat",
    "tiiuae__falcon-180B",
    "mistralai__Mistral-7B-v0.1",
    "facebook__xglm-7.5B",
]

We load the base DROP dataset to get the ground-truth (gold) answers, and do some preprocessing on the inputs.

In [None]:
# Validation split of the DROP dataset as a pandas frame; it supplies the
# reference answers that the model outputs are joined against further down.
drop_splits = datasets.load_dataset("drop")
base_df = drop_splits["validation"].to_pandas()

In [None]:
# Keep a single row per question before deriving the display/metadata columns.
base_df = base_df.drop_duplicates(subset=['query_id'])

# Markdown shown for each instance: the passage followed by the question.
base_df['input'] = [
    f"**Passage**: {passage} \n\n**Question:** {question}"
    for passage, question in zip(base_df['passage'], base_df['question'])
]

# Label: all reference spans joined into one comma-separated string.
base_df['answers'] = base_df['answers_spans'].apply(lambda spans: ", ".join(spans['spans']))
# The first entry of `types` is used as the answer type of the question.
base_df['answer type'] = base_df['answers_spans'].apply(lambda spans: spans['types'][0])

# Character lengths of the passage and the question.
for text_column in ('passage', 'question'):
    base_df[f'{text_column} length'] = base_df[text_column].str.len()

In [None]:
# How each instance is rendered: markdown for the input, plain text for label and output.
project_view = {
    "data": {"type": "markdown"},
    "label": {"type": "text"},
    "output": {"type": "text"},
}

# Each metric is the mean of one per-example column.
project_metrics = [
    ZenoMetric(name=metric_name, type="mean", columns=[column_name])
    for metric_name, column_name in [
        ("f1", "f1"),
        ("em", "em"),
        ("avg output length", "output length"),
    ]
]

proj = client.create_project(
    name="OpenLLM Leaderboard DROP Comparison",
    view=project_view,
    description="Exploring performance differences on DROP for models in OpenLLM Leaderboard.",
    metrics=project_metrics,
)

In [None]:
# Upload the shared dataset once: id, rendered input and reference answer, plus
# the length / answer-type columns that travel along with each instance.
dataset_columns = ["query_id", "input", "answers", "passage length", "question length", "answer type"]
proj.upload_dataset(
    base_df[dataset_columns],
    id_column="query_id",
    data_column="input",
    label_column="answers",
)

In [None]:
# Upload one Zeno system per model, keyed by the DROP query id.
system_columns = ["query_id", "output", "f1", "em", "output length"]
for model_name in models:
    print("uploading ", model_name)
    details = datasets.load_dataset(f"open-llm-leaderboard/details_{model_name}", "harness_drop_3")
    output_df = details["latest"].to_pandas()
    # Join the per-example details with the reference data, one row per question.
    merged_df = (
        output_df
        .merge(base_df, left_on="example", right_on="query_id")
        .drop_duplicates(subset=['query_id'])
    )
    # Only the first prediction is kept; rows without predictions get an empty output.
    merged_df['output'] = merged_df['predictions'].apply(
        lambda preds: preds[0] if len(preds) > 0 else ''
    )
    # When the details carry a `metrics` column, pull f1 / em out of it.
    # NOTE(review): otherwise `f1` and `em` are assumed to already be columns — confirm.
    if "metrics" in merged_df.columns:
        for score_name in ("f1", "em"):
            merged_df[score_name] = merged_df['metrics'].apply(
                lambda scores, key=score_name: scores[key]
            )
    merged_df['output length'] = merged_df['output'].str.len()
    proj.upload_system(
        merged_df[system_columns],
        name=model_name,
        id_column="query_id",
        output_column="output",
    )

Let's load Falcon specifically and modify the output to remove everything after the first newline.

We first define the DROP metric functions from the evaluation harness so that we can re-calculate the EM and F1 scores.

In [None]:
import re
import string

import numpy as np
from scipy.optimize import linear_sum_assignment

# Matches the English articles "a", "an" and "the" as whole words; used by _remove_articles.
_ARTICLES = re.compile(r"\b(a|an|the)\b", re.UNICODE)
def get_metrics(predicted, gold):
    """
    Score one prediction against one gold answer with the DROP metrics.

    Args:
        predicted: a string, or a list/tuple of strings (one per predicted span).
        gold: a string, or a list/tuple of strings (one per gold span).

    Returns:
        ``(exact_match, f1)``. ``exact_match`` is 1.0 when both answers contain the
        same number of spans and the same set of normalized spans, otherwise 0.0.
        ``f1`` is the mean of the per-span F1 values of the best 1-1 alignment
        between predicted and gold token bags, rounded to two decimals.
    """
    pred_spans, pred_token_bags = _answer_to_bags(predicted)
    gold_spans, gold_token_bags = _answer_to_bags(gold)

    same_span_set = set(pred_spans) == set(gold_spans)
    same_span_count = len(pred_spans) == len(gold_spans)
    exact_match = 1.0 if (same_span_set and same_span_count) else 0.0

    per_bag_f1 = _align_bags(pred_token_bags, gold_token_bags)
    f1 = round(np.mean(per_bag_f1), 2)
    return exact_match, f1


def _answer_to_bags(answer):
    """
    Normalize an answer and split every span into a bag (set) of tokens.

    A list or tuple is treated as several spans; any other value is treated as
    a single span. Returns ``(normalized_spans, token_bags)``, two parallel lists.
    """
    spans = answer if isinstance(answer, (list, tuple)) else [answer]
    normalized_spans = [_normalize(span) for span in spans]
    token_bags = [set(text.split()) for text in normalized_spans]
    return normalized_spans, token_bags


def _align_bags(predicted, gold):
    """
    Find the best 1-1 alignment between predicted and gold token bags.

    Builds a gold x predicted matrix of F1 scores (left at 0 for pairs whose
    numbers do not match), solves the assignment that maximizes the total score,
    and returns an array with one score per aligned gold bag. The array has
    ``max(len(gold), len(predicted))`` entries; unaligned positions stay at 0.
    """
    n_gold, n_pred = len(gold), len(predicted)
    scores = np.zeros([n_gold, n_pred])
    for g_idx, gold_bag in enumerate(gold):
        for p_idx, pred_bag in enumerate(predicted):
            if _match_numbers_if_present(gold_bag, pred_bag):
                scores[g_idx, p_idx] = _compute_f1(pred_bag, gold_bag)

    # linear_sum_assignment minimizes cost, so negate the scores to maximize them.
    gold_rows, pred_cols = linear_sum_assignment(-scores)

    max_scores = np.zeros([max(n_gold, n_pred)])
    for g_idx, p_idx in zip(gold_rows, pred_cols):
        max_scores[g_idx] = max(max_scores[g_idx], scores[g_idx, p_idx])
    return max_scores


def _compute_f1(predicted_bag, gold_bag):
    intersection = len(gold_bag.intersection(predicted_bag))
    if not predicted_bag:
        precision = 1.0
    else:
        precision = intersection / float(len(predicted_bag))
    if not gold_bag:
        recall = 1.0
    else:
        recall = intersection / float(len(gold_bag))
    f1 = (
        (2 * precision * recall) / (precision + recall)
        if not (precision == 0.0 and recall == 0.0)
        else 0.0
    )
    return f1


def _match_numbers_if_present(gold_bag, predicted_bag):
    """
    Return True when the gold bag contains no numeric token, or when at least
    one of its numeric tokens also appears in the predicted bag.
    """
    gold_numbers = {token for token in gold_bag if _is_number(token)}
    predicted_numbers = {token for token in predicted_bag if _is_number(token)}
    if not gold_numbers:
        return True
    return bool(gold_numbers & predicted_numbers)


def _is_number(text):
    try:
        float(text)
        return True
    except ValueError:
        return False


def _remove_articles(text):
    """Replace every match of the module-level ``_ARTICLES`` pattern with a single space."""
    without_articles = _ARTICLES.sub(" ", text)
    return without_articles


def _white_space_fix(text):
    return " ".join(text.split())


def _remove_punc(text):
    """
    Strip every ``string.punctuation`` character from ``text``.

    Text that parses as a number is returned untouched, so signs and decimal
    points survive.
    """
    if _is_number(text):
        return text
    return text.translate(str.maketrans("", "", string.punctuation))


def _fix_number(text):
    """Rewrite numeric text as ``str(float(text))``; return any other text unchanged."""
    if _is_number(text):
        return str(float(text))
    return text


def _tokenize(text):
    return re.split(" |-", text)


def _normalize(answer):
    """
    Normalize an answer string token by token.

    Each token is lower-cased, stripped of punctuation, canonicalized if it is
    a number, stripped of articles and whitespace-fixed, in that order. Tokens
    that end up blank are dropped and the rest are joined with single spaces.
    """
    cleaned_tokens = []
    for raw_token in _tokenize(answer):
        token = _remove_punc(raw_token.lower())
        token = _fix_number(token)
        token = _remove_articles(token)
        token = _white_space_fix(token)
        if token.strip():
            cleaned_tokens.append(token)
    return " ".join(cleaned_tokens).strip()

In [None]:
# models[2] is the Falcon entry ("tiiuae__falcon-180B") of the model list.
falcon_repo = "open-llm-leaderboard/details_" + models[2]
output_df = datasets.load_dataset(falcon_repo, "harness_drop_3")["latest"].to_pandas()

In [None]:
# Keep only the text before the first newline of the first prediction;
# rows without any prediction get an empty string.
output_df['output'] = output_df['predictions'].apply(
    lambda preds: preds[0].split("\n", 1)[0] if len(preds) > 0 else ''
)

In [None]:
# Re-score every truncated Falcon output with get_metrics and store both the
# exact-match and the F1 value. The scores are computed in one pass and assigned
# as whole columns instead of writing cell by cell into the frame while iterating it.
# NOTE(review): the gold argument is the first element of every entry in `answers`,
# passed as one multi-span answer. If those entries are alternative gold answers,
# the harness may instead take the best score over them — confirm against the harness.
falcon_scores = [
    get_metrics([prediction], [answer[0] for answer in answers])
    for prediction, answers in zip(output_df['output'], output_df['answers'])
]
output_df['em'] = [score[0] for score in falcon_scores]
output_df['f1'] = [score[1] for score in falcon_scores]

In [None]:
# Attach the reference data (one row per question) and upload the truncated
# Falcon outputs as a separate system.
falcon_columns = ["query_id", "output", "f1", "em", "output length"]
merged_df = (
    output_df
    .merge(base_df, left_on="example", right_on="query_id")
    .drop_duplicates(subset=['query_id'])
)
merged_df['output length'] = merged_df['output'].str.len()
proj.upload_system(
    merged_df[falcon_columns],
    name="Falcon normalized",
    id_column="query_id",
    output_column="output",
)