To run this notebook, download [mocha.tar.gz](https://github.com/anthonywchen/MOCHA/blob/main/data/mocha.tar.gz) and extract it to `data/mocha`. This notebook uses the following libraries:
- `spaCy` for tokenization;
- HuggingFace `datasets` for evaluation.

In [1]:
# Data loading
import json

# Tokenization
import spacy

# Evaluation
import datasets

In [2]:
# Data directory: where to look for the extracted MOCHA data files
DATA_DIR = "../data/mocha/"

# Dataset split to inspect
SPLIT = "dev"
# Full filepath to load (derived from SPLIT so that the two cannot disagree)
FILEPATH = f"{DATA_DIR}/{SPLIT}.json"

# Output directory
OUTPUT_DIR = "../outputs/proxy_metrics_20220426"

import os
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Evaluation tokenizer (the spaCy "parser" and "ner" components are disabled)
EVALUATION_TOKENIZER = spacy.load("en_core_web_sm", disable=["parser", "ner"])

In [3]:
# Read the JSON file with an explicit encoding and close the handle right
# after parsing (a bare `open()` would leave it open).
with open(FILEPATH, encoding="utf-8") as f:
    data = json.load(f)
print("Number of examples:", len(data))
# Peek at the first (key, value) pair of the loaded mapping
next(iter(data.items()))

Number of examples: 6


('cosmosqa',
 {'00484c61d760781cc315e45da887e138': {'candidate': 'I want to help Luke feed.',
   'context': "There is one area I want to work on . Breast - feeding . Right now , Luke's addicted to the bottle . We were so eager to wean him off his nose tube that when he started taking a bottle , we made it our only goal to re - enforce that .",
   'metadata': {'scores': [1, 1, 1], 'source': 'gpt2'},
   'question': 'What may be your reason for wanting to work on Breast - feeding ?',
   'reference': 'It could help my son .',
   'score': 1},
  '008fcce00cb5455d4449b29e516bafdc': {'candidate': 'the Olympics are watching',
   'context': "The strangest thing that has happened was when they were singing the Chinese National Anthem she was standing in front of the TV swaying and singing ... the words weren't really the words but it was kind of freaky ! I asked her is she knew the song and she said yes : ) She also is screamed fireworks a lot ! ! ! She did enjoy naming everyone she knows who is 

### Token overlap metrics

Current QA evaluation relies on string matching or token overlap metrics.

In [4]:
# Load the HuggingFace `datasets` metrics used by the helper functions below.
BLEU = datasets.load_metric("bleu", keep_in_memory=True)
BLEURT = datasets.load_metric("bleurt", keep_in_memory=True)
# ^ Note: requires bleurt (!pip install git+https://github.com/google-research/bleurt.git)
ROUGE = datasets.load_metric("rouge", keep_in_memory=True) 
# ^ Note: requires rouge-score (!pip install rouge-score)

METEOR = datasets.load_metric("meteor", keep_in_memory=True)
EXACT_MATCH = datasets.load_metric("exact_match", keep_in_memory=True)

BERT_SCORE = datasets.load_metric("bertscore", keep_in_memory=True)
# ^ Note: requires bert-score: https://pypi.org/project/bert-score/

# "ter" is the Translation Edit Rate metric, used below as an edit ratio.
EDIT_RATIO = datasets.load_metric("ter", keep_in_memory=True)
# ^ Note: requires sacrebleu (!pip install sacrebleu)

from collections import Counter
from typing import Dict, List, Union
import logging


# A tokenized text: a list of string tokens.
Tokens = List[str]
# Either a raw string or an already tokenized text.
Text = Union[str, Tokens]


def exact_match(y_true: Text, y_pred: Text) -> int:
    """Return 1 when the two inputs are equal and 0 otherwise.

    Two strings are compared directly. Two token sequences (lists or tuples)
    match only when they have the same length and are equal position by
    position; a length mismatch is logged at debug level and scored as 0.

    Raises
    ------
    ValueError
        If the arguments are neither both strings nor both lists/tuples.
    """
    if isinstance(y_true, str) and isinstance(y_pred, str):
        return 1 if y_true == y_pred else 0

    sequence_types = (list, tuple)
    both_sequences = isinstance(y_true, sequence_types) and isinstance(
        y_pred, sequence_types
    )
    if not both_sequences:
        error_msg = f"y_true ({type(y_true)}) and y_pred ({type(y_pred)})"
        raise ValueError(
            f"Cannot compare `exact_match` for argument types: {error_msg}"
        )

    if len(y_true) != len(y_pred):
        logging.debug(
            f"Dimension mismatch (default value is 0): {y_true} vs {y_pred}"
        )
        return 0

    # Same length: every aligned pair of tokens has to agree.
    return int(all(expected == predicted for expected, predicted in zip(y_true, y_pred)))


def first_error_position(
    y_true: Tokens, y_pred: Tokens, no_err_val: Union[int, None] = None
) -> Union[int, None]:
    """Determine the (0-based) position in the predicted sequence of the first error.

    Parameters
    ----------
    y_true, y_pred
        Non-empty lists or tuples of tokens.
    no_err_val
        Value returned when both sequences are identical (defaults to ``None``).

    Returns
    -------
    The 0-based index of the first mismatching token. When one sequence is a
    strict prefix of the other, the length of the shorter sequence is
    returned. When the sequences are identical, ``no_err_val`` is returned.

    Notes
    -----
    Differences are one of two types:

    1. Token mismatch: occurs within the common length of the two sequences,
       so the position lies in ``[0, min(lengths))``.
    2. Different number of tokens: one sequence is longer than the other, so
       the first error sits right after the end of the shorter one.

    Examples
    --------
    >>> y_true = ["The", "sky", "is", "blue"]
    >>> y_pred = ["A", "sky", "is", "blue"]
    >>> first_error_position(y_true, y_pred)
    0
    >>> y_pred = ["The", "sky", "IS", "blue"]
    >>> first_error_position(y_true, y_pred)
    2
    >>> first_error_position(y_true, ["The", "sky"])
    2
    >>> first_error_position(y_true, y_true, no_err_val=-1)
    -1
    """
    assert isinstance(y_true, (list, tuple)) and len(y_true) != 0
    assert isinstance(y_pred, (list, tuple)) and len(y_pred) != 0

    # Single pass over the common prefix: the first disagreement is the answer.
    for i, (true_token, pred_token) in enumerate(zip(y_true, y_pred)):
        if true_token != pred_token:
            return i

    # The common prefix agrees: either the sequences are identical ...
    if len(y_true) == len(y_pred):
        return no_err_val

    # ... or one is longer, and the error starts where the shorter one ends.
    return min(len(y_true), len(y_pred))


def _precision(tp, fp, tn, fn) -> float:
    return 0 if tp == 0 else tp / (tp + fp)


def _recall(tp, fp, tn, fn) -> float:
    return 0 if tp == 0 else tp / (tp + fn)


def _critical_success_index(tp, fp, tn, fn):
    "Ratio of positives w.r.t. number of errors (also dubbed threat score)."
    return 0 if tp == 0 else tp / (tp + fn + fp)


def _f1_score(precision=None, recall=None, **kwargs) -> float:
    if precision is not None and recall is not None:
        p = precision
        r = recall
        # return if precision or recall are 0
        if p == 0 or r == 0:
            return 0
    else:
        p = _precision(**kwargs)
        r = _recall(**kwargs)

    return (2 * p * r) / (p + r)


def f_metrics(references: Tokens, predictions: Tokens) -> Dict[str, float]:
    """Token-overlap precision, recall, F1 and CSI between two token lists.

    Tokens are treated as bags (multisets): a true positive is a token that
    occurs in both sequences, counted with multiplicity.
    """
    reference_counts = Counter(references)
    prediction_counts = Counter(predictions)

    # Multiset intersection: tokens shared by both sides, respecting repeats.
    shared = reference_counts & prediction_counts
    tp = sum(shared.values())
    fn = len(references) - tp
    fp = len(predictions) - tp
    tn = 0

    # The three counts must add up to the size of the multiset union.
    union_size = sum((reference_counts | prediction_counts).values())
    assert tp + fp + fn == union_size

    counts = {"tp": tp, "fp": fp, "tn": tn, "fn": fn}
    prec = _precision(**counts)
    rec = _recall(**counts)
    f1 = _f1_score(precision=prec, recall=rec)
    csi = _critical_success_index(**counts)

    return {"precision": prec, "recall": rec, "f1_score": f1, "csi": csi}

Using default BLEURT-Base checkpoint for sequence maximum length 128. You can use a bigger model for better results with e.g.: datasets.load_metric('bleurt', 'bleurt-large-512').


INFO:tensorflow:Reading checkpoint /home/kat/.cache/huggingface/metrics/bleurt/default/downloads/extracted/6aeb5b0da7794f61ad09cfb7badbbe4274f037d56edf549b3331c1732281ea88/bleurt-base-128.
INFO:tensorflow:Config file found, reading.
INFO:tensorflow:Will load checkpoint bert_custom
INFO:tensorflow:Loads full paths and checks that files exists.
INFO:tensorflow:... name:bert_custom
INFO:tensorflow:... vocab_file:vocab.txt
INFO:tensorflow:... bert_config_file:bert_config.json
INFO:tensorflow:... do_lower_case:True
INFO:tensorflow:... max_seq_length:128
INFO:tensorflow:Creating BLEURT scorer.
INFO:tensorflow:Creating WordPiece tokenizer.
INFO:tensorflow:WordPiece tokenizer instantiated.
INFO:tensorflow:Creating Eager Mode predictor.
INFO:tensorflow:Loading model.


2022-04-26 06:59:37.836001: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-26 06:59:37.836534: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2022-04-26 06:59:37.836575: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory
2022-04-26 06:59:37.836608: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcufft.so.10'; dlerror: libcufft.so.10: cannot open shared object file: No such file or directory
2022-04-26 06:59:37.836640: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could no

INFO:tensorflow:BLEURT initialized.


[nltk_data] Downloading package wordnet to /home/kat/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/kat/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/kat/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Couldn't find a directory or a metric named 'exact_match' in this version. It was picked from the master branch on github instead.


In [8]:
def tokenize(texts, tokens=True, sep: str = " ", tokenizer=EVALUATION_TOKENIZER) -> list:
    """Tokenize one text or an iterable of texts with ``tokenizer``.

    Each text is stripped of surrounding whitespace, passed to ``tokenizer``,
    and every resulting token is converted to ``str``.

    With ``tokens=True`` each text yields its list of tokens. Otherwise each
    text yields a one-element list holding the tokens joined by ``sep``.

    A single ``str`` input returns the result for that text alone; any other
    input returns one result per text, in order.
    """
    single_input = isinstance(texts, str)
    batch = [texts] if single_input else texts

    def _process(raw_text):
        pieces = [str(token) for token in tokenizer(raw_text.strip())]
        return pieces if tokens else [sep.join(pieces)]

    processed = [_process(raw_text) for raw_text in batch]
    return processed[0] if single_input else processed


def remove_punc(s):
    """Delete every '?', '.' and '!' character from ``s``."""
    # One pass with a deletion table instead of three chained replacements.
    return s.translate(str.maketrans("", "", "?.!"))


def normalize_answer(s: str) -> str:
    """Lower text and remove punctuation, articles and extra whitespace.

    The steps run in this order: lowercase, drop every character found in
    ``string.punctuation``, replace the standalone words "a", "an" and "the"
    with a space, then collapse all whitespace runs into single spaces.
    """
    import re
    import string

    punctuation = set(string.punctuation)

    lowered = s.lower()
    without_punc = "".join(ch for ch in lowered if ch not in punctuation)
    without_articles = re.sub(r"\b(a|an|the)\b", " ", without_punc)
    return " ".join(without_articles.split())


def get_examples(data):
    """Split a mapping of examples into three dicts keyed by example id.

    Returns ``(references, candidates, human_judgement)``, holding each
    example's "reference", "candidate" and "score" value respectively.
    """
    references, candidates, human_judgement = {}, {}, {}
    for example_id, example in data.items():
        references[example_id] = example["reference"]
        candidates[example_id] = example["candidate"]
        human_judgement[example_id] = example["score"]

    return references, candidates, human_judgement


def apply(d: dict, fn):
    """Return a new dict with the same keys as ``d`` and ``fn`` applied to each value."""
    transformed = {}
    for key, value in d.items():
        transformed[key] = fn(value)
    return transformed


In [9]:
def compute_rouge(**kwargs):
    """Run the ROUGE metric and keep only the mid F-measure of each ROUGE type.

    ``kwargs`` are forwarded unchanged to ``ROUGE.compute``.
    """
    aggregated = ROUGE.compute(**kwargs)

    fmeasures = {}
    for rouge_type, rouge in aggregated.items():
        fmeasures[rouge_type] = rouge.mid.fmeasure
    return fmeasures

def compute_bleu(max_order=4, **kwargs):
    """Compute BLEU for every n-gram order from 1 up to ``max_order``.

    Parameters
    ----------
    max_order : int
        Highest n-gram order to evaluate.
    **kwargs
        Forwarded unchanged to ``BLEU.compute`` (e.g. ``predictions`` and
        ``references``).

    Returns
    -------
    dict
        ``"bleu1" .. "bleu{max_order}"`` with the BLEU score of each order
        (0 when the computation for that order failed), plus
        ``"bleu-precision0" .. "bleu-precision{max_order - 1}"`` with the
        n-gram precisions of the last successful computation. Note that the
        precision keys are 0-based while the BLEU keys are 1-based.
    """
    bleu = {}
    precisions = []
    for order in range(1, max_order + 1):
        try:
            results = BLEU.compute(**kwargs, max_order=order)
        except Exception:
            # Best effort: a failing order is scored as 0. `Exception` (rather
            # than a bare `except`) lets KeyboardInterrupt/SystemExit through.
            print("Error (returning 0):", kwargs)
            bleu[f"bleu{order}"] = 0
        else:
            bleu[f"bleu{order}"] = results["bleu"]
            precisions = list(results["precisions"])

    # Always report `max_order` precisions: orders that were never computed
    # successfully count as 0, so every example yields the same set of keys.
    precisions += [0] * (max_order - len(precisions))

    for i, p in enumerate(precisions):
        bleu[f"bleu-precision{i}"] = p

    return bleu

def compute_edit_score(**kwargs):
    """Run the TER metric and expose its score as a ratio under "edit_score".

    ``kwargs`` are forwarded unchanged to ``EDIT_RATIO.compute``. The returned
    dict is the metric's own result with "score" replaced by
    "edit_score" (= score / 100) and "ref_length" removed.
    """
    ter = EDIT_RATIO.compute(**kwargs)
    # Example of the raw result:
    # {'score': 133.33333333333331, 'num_edits': 4, 'ref_length': 3.0}
    raw_score = ter.pop("score")
    del ter["ref_length"]
    ter["edit_score"] = raw_score / 100
    return ter

In [12]:
def compute_metrics(data, min_score=1, max_score=5):
    """Compute every proxy metric for each example of one dataset.

    Parameters
    ----------
    data : dict
        Maps example id -> example dict holding at least the keys
        "reference", "candidate" and "score".
    min_score, max_score : number
        Bounds of the human judgement scale. "score" is linearly rescaled so
        that ``min_score`` maps to 0 and ``max_score`` maps to 1. The defaults
        (1 and 5) reproduce the previously hard-coded scale.

    Returns
    -------
    list of dict
        One dict per example with its id, the punctuation-stripped texts,
        their tokens, the original and rescaled human score, and the values
        of all metrics (exact match, METEOR, ROUGE, BLEURT, BERTScore, BLEU,
        token-overlap precision/recall/F1/CSI and TER edit score).

    Raises
    ------
    ValueError
        If ``max_score`` equals ``min_score`` (the rescaling is undefined).
    """
    if max_score == min_score:
        raise ValueError("`max_score` must differ from `min_score`")
    score_range = max_score - min_score

    # Separate data in references and candidates
    references, candidates, human_judgements = get_examples(data)
    assert len(references) == len(candidates)

    # NOTE: only '?', '.' and '!' are stripped here. The stronger
    # `normalize_answer` caused inconsistent results because a few candidate
    # answers are simply "the" or "an" and would become empty strings.
    references_norm = apply(references, fn=remove_punc)
    candidates_norm = apply(candidates, fn=remove_punc)

    # Tokenize data
    references_tokens = apply(references_norm, fn=tokenize)
    candidates_tokens = apply(candidates_norm, fn=tokenize)

    metric_results = []
    for example_id, correctness in human_judgements.items():
        reference = references_norm[example_id]
        candidate = candidates_norm[example_id]

        reference_tks = references_tokens[example_id]
        candidate_tks = candidates_tokens[example_id]

        metrics = {
            "example_id": example_id,

            "reference": reference,
            "candidate": candidate,

            "reference_tokens": reference_tks,
            "candidate_tokens": candidate_tks,

            "human_correctness_original": correctness,
            "human_correctness": (correctness - min_score) / score_range,
        }

        # Metrics that take raw strings (one prediction, one reference)
        text_args = {"predictions": [candidate], "references": [reference]}
        metrics.update(EXACT_MATCH.compute(**text_args))
        metrics.update(METEOR.compute(**text_args))
        metrics.update(compute_rouge(**text_args))
        metrics.update({"bleurt": BLEURT.compute(**text_args)["scores"][0]})
        metrics.update({"bert-score": BERT_SCORE.compute(**text_args, lang="en")["f1"][0]})

        # Surface examples whose candidate became empty after punctuation removal
        if not candidate:
            print(data[example_id], metrics)

        # BLEU takes tokens and a list of references per prediction
        token_args_w_mult_refs_args = {"predictions": [candidate_tks], "references": [[reference_tks]]}
        metrics.update(compute_bleu(**token_args_w_mult_refs_args))

        # Bag-of-tokens precision / recall / F1 / CSI
        token_args = {"predictions": candidate_tks, "references": reference_tks}
        metrics.update(f_metrics(**token_args))

        text_w_mult_refs_args = {"predictions": [candidate], "references": [[reference]]}
        # TER score (num_edits / sum_ref_lengths * 100)
        metrics.update(compute_edit_score(**text_w_mult_refs_args))

        metric_results.append(metrics)

    return metric_results

# Sanity check: run the whole metric pipeline on the "drop" subset of the
# data loaded above. The result is a list with one metrics dict per example.
d = compute_metrics(data["drop"])

AttributeError: 'list' object has no attribute 'head'

In [14]:
import pandas as pd 


def write_predictions(metrics, split, dataset=None, output_dir=OUTPUT_DIR):
    """Write ``metrics`` as a gzip-compressed CSV named after the split and dataset.

    The file is written to ``{output_dir}/{split}_{dataset}_metrics.csv.gz``
    via ``metrics.to_csv``, and its path is printed first.
    """
    filename = f"{split}_{dataset}_metrics.csv.gz"
    output_file = f"{output_dir}/{filename}"
    print("Writing metrics at", output_file)
    metrics.to_csv(output_file, compression="gzip")
    
    
def persist_metrics(split, dataset=None, input_dir=DATA_DIR, output_dir=OUTPUT_DIR):
    """Compute the metrics of a split, write them to disk and return them.

    Parameters
    ----------
    split : str
        Name of the split; the data is read from ``{input_dir}/{split}.json``.
    dataset : str, collection of str, or None
        Dataset name(s) to keep. ``None`` keeps every dataset in the file.
    input_dir : str
        Directory holding the split's JSON file.
    output_dir : str
        Directory where the CSV files are written.

    Returns
    -------
    pandas.DataFrame
        The metrics of all selected datasets concatenated, with "dataset" and
        "split" columns added. One CSV is written per dataset, plus an
        "all_datasets" CSV when more than one dataset was processed.

    Raises
    ------
    ValueError
        If no dataset in the file matches the selection.
    """
    _filepath = f"{input_dir}/{split}.json"

    # Close the file as soon as it is parsed instead of leaking the handle.
    with open(_filepath, encoding="utf-8") as f:
        data = json.load(f)

    if isinstance(dataset, str):
        dataset = [dataset]

    if dataset is not None:
        data = {k: v for k, v in data.items() if k in dataset}
    print("Number of datasets:", len(data))

    if not data:
        # Fail with an explicit message rather than pandas' generic
        # "No objects to concatenate" further down.
        raise ValueError(
            f"No datasets to process in {_filepath} (requested: {dataset})"
        )

    all_metrics = []

    # `examples` (not `dataset`) so the loop does not shadow the parameter.
    for dataset_name, examples in data.items():
        print("Computing metrics", len(examples), "examples of dataset", dataset_name)
        metrics = compute_metrics(examples)
        metrics = pd.DataFrame(metrics)

        metrics["dataset"] = dataset_name
        metrics["split"] = split

        write_predictions(metrics, split=split, dataset=dataset_name, output_dir=output_dir)
        all_metrics.append(metrics)

    all_metrics = pd.concat(all_metrics).reset_index(drop=True)

    if len(data) > 1:
        write_predictions(all_metrics, split=split, dataset="all_datasets", output_dir=output_dir)

    return all_metrics

In [15]:
%%time
# Compute and write the metrics for every dataset of the "dev" split,
# then show summary statistics of the resulting DataFrame.
dev_metrics = persist_metrics("dev")
dev_metrics.describe()

Number of datasets: 6
Computing metrics 683 examples of dataset cosmosqa
Writing metrics at ../outputs/proxy_metrics_20220426/dev_cosmosqa_metrics.csv.gz
Computing metrics 97 examples of dataset drop
Writing metrics at ../outputs/proxy_metrics_20220426/dev_drop_metrics.csv.gz
Computing metrics 978 examples of dataset mcscript
Writing metrics at ../outputs/proxy_metrics_20220426/dev_mcscript_metrics.csv.gz
Computing metrics 890 examples of dataset narrativeqa




{'candidate': '.', 'context': 'The story centres on the relationship between Mrs Kitty Warren and her daughter, Vivie. Mrs. Warren, a former prostitute and current brothel owner, is described as "on the whole, a genial and fairly presentable old blackguard of a woman." Vivie, an intelligent and pragmatic young woman who has just graduated from university, has come home to get acquainted with her mother for the first time in her life. The play focuses on how their relationship changes when Vivie learns what her mother does for a living. It explains why Mrs. Warren became a prostitute, condemns the hypocrisies relating to prostitution, and criticises the limited employment opportunities available for women in Victorian Britain.Vivie Warren, a thoroughly modern young woman, has just graduated from the University of Cambridge with honours in Mathematics (equal Third Wrangler), and is available for suitors. Her mother, Mrs. Warren (her name changed to hide her identity and give the impressi



{'candidate': '.', 'context': 'The film is a semi-biographical story based upon the early life and rise to prominence of Native American dog musher George Attla, Jr. (1933–2015).  Attla, known as "the Huslia Hustler," took his nickname from one of his mentors, Jimmy Huntington, who first began winning races during Attla\'s childhood.  In recent generations, this nickname has become associated with Attla far more than with Huntington.  Attla was a leading star of the 1960s and 1970s in the sport of sprint dog sled racing.  He won the Fur Rendezvous World Championship race, held in Anchorage, Alaska, 10 times between 1958 and 1982.  He also won 8 championships in the Open North American Championship race, held in Fairbanks, Alaska.  In addition, despite his mushing experience being geared more towards sprint than distance racing, Attla competed in the inaugural Iditarod Trail Sled Dog Race in 1973, placing fifth.  In a 2011 interview, Attla spoke of the popularity of the Iditarod, and ho

Unnamed: 0,human_correctness_original,human_correctness,exact_match,meteor,rouge1,rouge2,rougeL,rougeLsum,bleurt,bert-score,...,bleu-precision0,bleu-precision1,bleu-precision2,bleu-precision3,precision,recall,f1_score,csi,num_edits,edit_score
count,4009.0,4009.0,4009.0,4009.0,4009.0,4009.0,4009.0,4009.0,4009.0,4009.0,...,4009.0,4009.0,4009.0,4009.0,4009.0,4009.0,4009.0,4009.0,4009.0,4009.0
mean,2.357238,0.339309,0.024944,0.247296,0.287694,0.107104,0.281029,0.281029,-0.688747,0.882008,...,0.258614,0.093471,0.042008,0.016961,0.258614,0.291364,0.245709,0.16909,4.633076,1.593062
std,1.498841,0.37471,1.579363,0.270958,0.274079,0.209965,0.270506,0.270506,0.762111,0.054886,...,0.291076,0.202191,0.145938,0.089283,0.291076,0.314313,0.258518,0.199534,4.1684,2.703827
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.616384,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.290687,0.847648,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.666667
50%,1.666667,0.166667,0.0,0.16129,0.25,0.0,0.230769,0.230769,-0.831038,0.873768,...,0.166667,0.0,0.0,0.0,0.166667,0.2,0.181818,0.1,3.0,1.0
75%,3.666667,0.666667,0.0,0.387427,0.5,0.111111,0.5,0.5,-0.1511,0.910523,...,0.5,0.0,0.0,0.0,0.5,0.5,0.444444,0.285714,6.0,1.5
max,5.0,1.0,100.0,0.999624,1.0,1.0,1.0,1.0,1.200342,1.000011,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,79.0,79.0


In [16]:
# Reload the combined dev metrics from disk (first CSV column is the index)
# and summarize them, to check that the written file matches the in-memory result.
pd.read_csv(f"{OUTPUT_DIR}/dev_all_datasets_metrics.csv.gz", index_col=0).describe()

Unnamed: 0,human_correctness_original,human_correctness,exact_match,meteor,rouge1,rouge2,rougeL,rougeLsum,bleurt,bert-score,...,bleu-precision0,bleu-precision1,bleu-precision2,bleu-precision3,precision,recall,f1_score,csi,num_edits,edit_score
count,4009.0,4009.0,4009.0,4009.0,4009.0,4009.0,4009.0,4009.0,4009.0,4009.0,...,4009.0,4009.0,4009.0,4009.0,4009.0,4009.0,4009.0,4009.0,4009.0,4009.0
mean,2.357238,0.339309,0.024944,0.247296,0.287694,0.107104,0.281029,0.281029,-0.688747,0.882008,...,0.258614,0.093471,0.042008,0.016961,0.258614,0.291364,0.245709,0.16909,4.633076,1.593062
std,1.498841,0.37471,1.579363,0.270958,0.274079,0.209965,0.270506,0.270506,0.762111,0.054886,...,0.291076,0.202191,0.145938,0.089283,0.291076,0.314313,0.258518,0.199534,4.1684,2.703827
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.616384,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.290687,0.847648,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.666667
50%,1.666667,0.166667,0.0,0.16129,0.25,0.0,0.230769,0.230769,-0.831038,0.873768,...,0.166667,0.0,0.0,0.0,0.166667,0.2,0.181818,0.1,3.0,1.0
75%,3.666667,0.666667,0.0,0.387427,0.5,0.111111,0.5,0.5,-0.1511,0.910523,...,0.5,0.0,0.0,0.0,0.5,0.5,0.444444,0.285714,6.0,1.5
max,5.0,1.0,100.0,0.999624,1.0,1.0,1.0,1.0,1.200342,1.000011,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,79.0,79.0


In [17]:
# Same check as the previous cell: summarize the combined dev metrics read back from disk.
pd.read_csv(f"{OUTPUT_DIR}/dev_all_datasets_metrics.csv.gz", index_col=0).describe()

Unnamed: 0,human_correctness_original,human_correctness,exact_match,meteor,rouge1,rouge2,rougeL,rougeLsum,bleurt,bert-score,...,bleu-precision0,bleu-precision1,bleu-precision2,bleu-precision3,precision,recall,f1_score,csi,num_edits,edit_score
count,4009.0,4009.0,4009.0,4009.0,4009.0,4009.0,4009.0,4009.0,4009.0,4009.0,...,4009.0,4009.0,4009.0,4009.0,4009.0,4009.0,4009.0,4009.0,4009.0,4009.0
mean,2.357238,0.339309,0.024944,0.247296,0.287694,0.107104,0.281029,0.281029,-0.688747,0.882008,...,0.258614,0.093471,0.042008,0.016961,0.258614,0.291364,0.245709,0.16909,4.633076,1.593062
std,1.498841,0.37471,1.579363,0.270958,0.274079,0.209965,0.270506,0.270506,0.762111,0.054886,...,0.291076,0.202191,0.145938,0.089283,0.291076,0.314313,0.258518,0.199534,4.1684,2.703827
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.616384,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.290687,0.847648,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.666667
50%,1.666667,0.166667,0.0,0.16129,0.25,0.0,0.230769,0.230769,-0.831038,0.873768,...,0.166667,0.0,0.0,0.0,0.166667,0.2,0.181818,0.1,3.0,1.0
75%,3.666667,0.666667,0.0,0.387427,0.5,0.111111,0.5,0.5,-0.1511,0.910523,...,0.5,0.0,0.0,0.0,0.5,0.5,0.444444,0.285714,6.0,1.5
max,5.0,1.0,100.0,0.999624,1.0,1.0,1.0,1.0,1.200342,1.000011,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,79.0,79.0


In [None]:
%%time
# Compute and write the metrics for every dataset of the "train" split,
# then show summary statistics of the resulting DataFrame.
train_metrics = persist_metrics("train")
train_metrics.describe()

Number of datasets: 6
Computing metrics 5033 examples of dataset cosmosqa
Writing metrics at ../outputs/proxy_metrics_20220426/train_cosmosqa_metrics.csv.gz
Computing metrics 687 examples of dataset drop
Writing metrics at ../outputs/proxy_metrics_20220426/train_drop_metrics.csv.gz
Computing metrics 7210 examples of dataset mcscript
Writing metrics at ../outputs/proxy_metrics_20220426/train_mcscript_metrics.csv.gz
Computing metrics 7471 examples of dataset narrativeqa




{'candidate': '.', 'context': "Chris Hughes (Davis), an adopted and geeky Ohio high school senior, discovers that his recently deceased birth parents are the proprietors of a vast pornography empire and he is the inherited heir. Dropped into a bitter power struggle, his new flock of beautiful co-workers come to his aid. Chris Hughes was an outsider and geek in Ohio. He is in the middle of his senior year at high school and he is 18 years old. Chris earns extra money working on a paper route riding a moped. Right now, he would do anything to get out of here. Mostly Chris is raised by his religious parents, Carl Hughes (Dave Thomas) and Beverly Hughes (Colleen Camp). They own a grocery store, they are very strict on no drinking, smoking and no sex until you are married. They also don't tolerate porn or porno magazines, that Chris hides under his bed. His little adopted brother Danny Hughes is popular and has a better chance with a girl than Chris. Danny usually gets away with murder by h

In [22]:
# Reload the combined train metrics from disk (first CSV column is the index) to inspect the rows.
pd.read_csv(f"{OUTPUT_DIR}/train_all_datasets_metrics.csv.gz", index_col=0)

Unnamed: 0,example_id,reference,candidate,reference_tokens,candidate_tokens,human_correctness_original,human_correctness,exact_match,meteor,rouge1,...,bleu-precision2,bleu-precision3,precision,recall,f1_score,csi,num_edits,edit_score,dataset,split
0,002b5d9aa346d492b02705ae2c9f4abd,Because he likes her a lot,He's a child and it's a very rare thing,"['Because', 'he', 'likes', 'her', 'a', 'lot']","['He', ""'s"", 'a', 'child', 'and', 'it', ""'s"", ...",1,0.00,0.0,0.153846,0.235294,...,0.00,0.0,0.090909,0.166667,0.117647,0.062500,8,1.333333,cosmosqa,train
1,00336fecc378c067330935b63ce0351d,Two of the kittens wouldn't have been killed,The kitten would have been killed,"['Two', 'of', 'the', 'kittens', 'would', ""n't""...","['The', 'kitten', 'would', 'have', 'been', 'ki...",1,0.00,0.0,0.676884,0.533333,...,0.25,0.0,0.666667,0.444444,0.533333,0.363636,4,0.500000,cosmosqa,train
2,003950650438f8a6446095e8f7c8e0bd,It didn't offer an accurate representation of ...,it does not have a clear idea about culture in...,"['It', 'did', ""n't"", 'offer', 'an', 'accurate'...","['it', 'does', 'not', 'have', 'a', 'clear', 'i...",3,0.50,0.0,0.099010,0.190476,...,0.00,0.0,0.090909,0.100000,0.095238,0.050000,9,1.000000,cosmosqa,train
3,0047b58deedccd8f5d793a79b665d575,If they wear something a lot,when they wear something,"['If', 'they', 'wear', 'something', 'a', 'lot']","['when', 'they', 'wear', 'something']",1,0.00,0.0,0.507663,0.600000,...,0.50,0.0,0.750000,0.500000,0.600000,0.428571,3,0.500000,cosmosqa,train
4,006f2ed4132dd86ff6f88aa834e8ce33,He might have been in a costumes party,He is a fan of Vince McMahon,"['He', 'might', 'have', 'been', 'in', 'a', 'co...","['He', 'is', 'a', 'fan', 'of', 'Vince', 'McMah...",1,0.00,0.0,0.126582,0.266667,...,0.00,0.0,0.285714,0.250000,0.266667,0.153846,7,0.875000,cosmosqa,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31064,fff6adbf5a7d8bb4c12417699b7bfc49,needed to reward Alex,need to reward Alex,"['needed', 'to', 'reward', 'Alex']","['need', 'to', 'reward', 'Alex']",5,1.00,0.0,0.992188,0.750000,...,0.50,0.0,0.750000,0.750000,0.750000,0.600000,1,0.250000,socialiqa,train
31065,fffa9767157147e739d14af04c736c54,gas up the car,go to the gym,"['gas', 'up', 'the', 'car']","['go', 'to', 'the', 'gym']",1,0.00,0.0,0.125000,0.250000,...,0.00,0.0,0.250000,0.250000,0.250000,0.142857,3,0.750000,socialiqa,train
31066,fffb5bf0cbb536ab6a767396015ac98b,selfish,confident,['selfish'],['confident'],1,0.00,0.0,0.000000,0.000000,...,0.00,0.0,0.000000,0.000000,0.000000,0.000000,1,1.000000,socialiqa,train
31067,fffc03fc5f061ad5cc913108eef37ae4,have a stuffed bear,have their teddy bear lying there,"['have', 'a', 'stuffed', 'bear']","['have', 'their', 'teddy', 'bear', 'lying', 't...",3,0.50,0.0,0.238095,0.400000,...,0.00,0.0,0.333333,0.500000,0.400000,0.250000,4,1.000000,socialiqa,train


In [19]:
!ls {OUTPUT_DIR}

dev_all_datasets_metrics.csv.gz  train_all_datasets_metrics.csv.gz
dev_cosmosqa_metrics.csv.gz	 train_cosmosqa_metrics.csv.gz
dev_drop_metrics.csv.gz		 train_drop_metrics.csv.gz
dev_mcscript_metrics.csv.gz	 train_mcscript_metrics.csv.gz
dev_narrativeqa_metrics.csv.gz	 train_narrativeqa_metrics.csv.gz
dev_quoref_metrics.csv.gz	 train_quoref_metrics.csv.gz
dev_socialiqa_metrics.csv.gz	 train_socialiqa_metrics.csv.gz
