In [1]:
import os
import json
import spacy
import datasets


# --- Paths (all relative to the notebook's working directory) ---
ROOT_DIR = "../.."
ORIGINAL_MOCHA_DIR = ROOT_DIR + "/data/metric-modeling/mocha"
PREPROC_DIR = ROOT_DIR + "/data/raw_splits"

# --- Split file stems and dataset tags ---
# NOTE(review): DATASETS presumably lists the tags present in the MOCHA
# JSON files -- confirm against the data.
SPLITS = ("train", "dev", "test")
DATASETS = (
    "cosmosqa",
    "drop",
    "mcscript",
    "narrativeqa",
    "quoref",
    "socialiqa",
)

# Make sure the output directory exists (no-op when it already does).
os.makedirs(PREPROC_DIR, exist_ok=True)

In [2]:
# Peek at the first split to see which dataset tags it contains.
filepath = f"{ORIGINAL_MOCHA_DIR}/{SPLITS[0]}.json"

# Context manager: the file handle is closed as soon as parsing is done.
with open(filepath) as split_file:
    data = json.load(split_file)

# Deliberately NOT called `datasets`: that name is the module imported in the
# first cell, and rebinding it to a list would shadow the module.
dataset_names = list(data.keys())

In [3]:
def read_json_dataset(parent_dir, filename, dataset=None) -> dict:
    """Load a JSON dataset file, optionally keeping only some dataset tags.

    The file ``{parent_dir}/{filename}.json`` is expected to contain a
    mapping of the form ``{tag1: {examples}, tag2: {...}, ...}``.

    Parameters
    ----------
    parent_dir: str
        Directory that contains the JSON file.
    filename: str
        File name without the ``.json`` extension.
    dataset: str or list/tuple/set of str, optional
        Tag, or collection of tags, to keep. If None (default) every tag
        found in the file is returned.

    Returns
    -------
    dict
        ``{tag: examples}`` restricted to the requested tags. Requested tags
        that are absent from the file are silently left out.
    """
    # Close the file deterministically instead of leaking the handle.
    with open(f"{parent_dir}/{filename}.json") as json_file:
        data = json.load(json_file)

    if dataset is None:
        wanted = list(data.keys())
    elif isinstance(dataset, (list, tuple, set, frozenset)):
        # Any plain collection of tags is accepted; previously a tuple or a
        # set was wrapped as a single element and therefore matched nothing.
        wanted = list(dataset)
    else:
        wanted = [dataset]

    return {tag: examples for tag, examples in data.items() if tag in wanted}


# Smoke test: load only the "drop" tag from the dev split.
data = read_json_dataset(parent_dir=ORIGINAL_MOCHA_DIR, filename="dev", dataset="drop")

In [56]:
from bert_score import score as BERT_SCORE
from datasets import load_metric
from pycocoevalcap.meteor.meteor import Meteor as pccMeteor
from pycocoevalcap.rouge.rouge import Rouge as pccRouge
from pycocoevalcap.bleu.bleu import Bleu as pccBleu


def remove_punc(s):
    """Return ``s`` with every '?', '.' and '!' removed; other characters are kept."""
    return s.translate(str.maketrans("", "", "?.!"))

def update_examples(examples, key, values):
    """Store one value per example under ``key``, modifying the examples in place.

    Parameters
    ----------
    examples: dict or sized iterable of dict
        Either a mapping ``{example_id: example}`` or the example dicts
        themselves (e.g. ``mapping.values()``).
    key: str
        Field name to write into each example.
    values: sized iterable
        One value per example, in the same order as ``examples``.

    Raises
    ------
    AssertionError
        If the number of values differs from the number of examples.
    """
    # A mapping is accepted as well: iterating a dict directly would yield
    # its keys (the example ids), which cannot be assigned into.
    if isinstance(examples, dict):
        examples = examples.values()

    assert len(examples) == len(values), (
        f"Expected one '{key}' value per example: "
        f"got {len(values)} values for {len(examples)} examples"
    )

    for example, value in zip(examples, values):
        example[key] = value


def add_bleu_predictions(mocha_dataset, order: int=4):
    """Annotate every example with instance-level BLEU-1 ... BLEU-``order``.

    ``mocha_dataset`` maps each dataset tag to ``{example_id: example}``.
    The punctuation-stripped "reference" and "candidate" of every example
    are scored, and the results are written in place under the keys
    "bleu1", ..., "bleu{order}".
    """
    scorer = pccBleu(order)

    for instances in mocha_dataset.values():
        rows = list(instances.values())

        # Both sides are passed to the scorer as {index: [sentence]}.
        refs = dict(enumerate([remove_punc(row['reference'])] for row in rows))
        cands = dict(enumerate([remove_punc(row['candidate'])] for row in rows))

        # compute_score returns (aggregate BLEU, instance-wise BLEU);
        # index 1 holds one list of per-instance scores for each n-gram order.
        per_instance = scorer.compute_score(refs, cands, verbose=0)[1]

        for n in range(1, order + 1):
            update_examples(instances.values(), f"bleu{n}", per_instance[n - 1])


def add_meteor_predictions(mocha_dataset):
    """Annotate every example, in place, with a "meteor" score.

    For each dataset in ``mocha_dataset`` the punctuation-stripped
    references and candidates are scored with one shared scorer instance.
    """
    scorer = pccMeteor()

    for instances in mocha_dataset.values():
        rows = list(instances.values())

        # Both sides are passed to the scorer as {index: [sentence]}.
        refs = dict(enumerate([remove_punc(row['reference'])] for row in rows))
        cands = dict(enumerate([remove_punc(row['candidate'])] for row in rows))

        # Index 1 of the result is used as the per-instance scores
        # (presumably the same (aggregate, per-instance) layout as BLEU).
        per_instance = scorer.compute_score(refs, cands)[1]
        update_examples(instances.values(), "meteor", per_instance)


def add_rouge_predictions(mocha_dataset):
    """Annotate every example, in place, with a "rougeL" score.

    For each dataset in ``mocha_dataset`` the punctuation-stripped
    references and candidates are scored with one shared scorer instance.
    """
    scorer = pccRouge()

    for instances in mocha_dataset.values():
        rows = list(instances.values())

        # Both sides are passed to the scorer as {index: [sentence]}.
        refs = dict(enumerate([remove_punc(row['reference'])] for row in rows))
        cands = dict(enumerate([remove_punc(row['candidate'])] for row in rows))

        # Index 1 of the result is used as the per-instance scores
        # (presumably the same (aggregate, per-instance) layout as BLEU).
        per_instance = scorer.compute_score(refs, cands)[1]
        update_examples(instances.values(), "rougeL", per_instance)

        
def add_bertscore_predictions(mocha_dataset):
    """Annotate every example, in place, with a "bertscore" value.

    Each dataset is scored in one call on the punctuation-stripped
    candidates and references, with ``lang='en'``.
    """
    for instances in mocha_dataset.values():
        rows = list(instances.values())
        refs = [remove_punc(row['reference']) for row in rows]
        cands = [remove_punc(row['candidate']) for row in rows]

        outputs = BERT_SCORE(cands, refs, lang='en')
        # Only the last returned element is kept
        # (NOTE(review): F1 per bert_score's (P, R, F1) convention -- confirm).
        per_instance = outputs[-1].tolist()
        update_examples(instances.values(), "bertscore", per_instance)


def add_bleurt_predictions(mocha_dataset):
    """Annotate every example, in place, with a "bleurt" score.

    The metric is loaded once and then called one example at a time on the
    punctuation-stripped candidate and reference.
    """
    scorer = load_metric("bleurt", keep_in_memory=True)

    for instances in mocha_dataset.values():
        for instance in instances.values():
            candidate = remove_punc(instance["candidate"])
            reference = remove_punc(instance["reference"])

            result = scorer.compute(predictions=[candidate], references=[reference])
            # A single pair was scored, so the single entry is the score.
            instance["bleurt"] = result["scores"][0]


def add_edit_score(mocha_dataset, **kwargs):
    """Compute the translation error rate to quantify the edit operations.

    We use the implementation available at
    https://github.com/huggingface/datasets/tree/fad939b5e17b672a4eda7de2cd8e24d98f3d5b26/metrics/ter.

    TER score represents the fraction of edits divided over the reference length.
    The result is written in place to each example under "edit_ratio".

    Parameters
    ----------
    mocha_dataset: dict
        Mapping ``{dataset: {example_id: example}}``; every example must have
        a "candidate" and a "reference" string.

    Keyword Arguments
    -----------------
    Forwarded unchanged to the metric's ``compute`` call.

    normalized: bool, defaults to False
        If true, applies basic tokenization and normalization to sentences.
    ignore_punct: bool, defaults to False
        If true, punctuation is removed from the sentences before scoring.
    case_sensitive: bool, defaults to False
        If false, makes all predictions and references lowercase to ignore
        differences in casing.
    """
    edit_ratio = load_metric("ter", keep_in_memory=True)

    for examples in mocha_dataset.values():
        for example in examples.values():
            candidate = remove_punc(example["candidate"])
            reference = remove_punc(example["reference"])

            # One list of references per prediction. The options are passed
            # through; previously they were accepted but silently dropped.
            scores = edit_ratio.compute(
                predictions=[candidate], references=[[reference]], **kwargs
            )
            # The reported score is divided by 100 to store it as a fraction.
            example["edit_ratio"] = scores["score"] / 100


def add_word_edit_rate(mocha_dataset):
    """Annotate every example, in place, with a "word_edit_score".

    The value is whatever the "wer" metric returns for the
    punctuation-stripped candidate against the punctuation-stripped
    reference, scored one example at a time.

    Metric implementation:
    https://github.com/huggingface/datasets/tree/fad939b5e17b672a4eda7de2cd8e24d98f3d5b26/metrics/wer
    (needs the ``jiwer`` package: !pip install jiwer)
    """
    scorer = load_metric("wer", keep_in_memory=True)

    for instances in mocha_dataset.values():
        for instance in instances.values():
            hypothesis = remove_punc(instance["candidate"])
            truth = remove_punc(instance["reference"])

            instance["word_edit_score"] = scorer.compute(
                predictions=[hypothesis], references=[truth]
            )
    
def add_recall(mocha_dataset):
    """Annotate every example, in place, with token-level recall.

    Candidate and reference are stripped of punctuation and split on
    whitespace. Tokens are matched as multisets, so a repeated token only
    counts as often as it appears on both sides. Keys written:

    - "recall": matched tokens / reference tokens (0 when nothing matches)
    - "tp": number of matched tokens
    - "fn": reference tokens that were not matched
    """
    from collections import Counter

    for instances in mocha_dataset.values():
        for instance in instances.values():
            cand_tokens = remove_punc(instance["candidate"]).split()
            ref_tokens = remove_punc(instance["reference"]).split()

            # Multiset intersection keeps the minimum count of each token.
            overlap = Counter(ref_tokens) & Counter(cand_tokens)
            matched = sum(overlap.values())

            # The zero branch also covers an empty reference (no division).
            instance["recall"] = matched / len(ref_tokens) if matched else 0
            instance["tp"] = matched
            instance["fn"] = len(ref_tokens) - matched


def add_precision(mocha_dataset):
    """Annotate every example, in place, with token-level precision.

    Candidate and reference are stripped of punctuation and split on
    whitespace. Tokens are matched as multisets, so a repeated token only
    counts as often as it appears on both sides. Keys written:

    - "precision": matched tokens / candidate tokens (0 when nothing matches)
    - "tp": number of matched tokens
    - "fp": candidate tokens that were not matched
    """
    from collections import Counter

    for instances in mocha_dataset.values():
        for instance in instances.values():
            cand_tokens = remove_punc(instance["candidate"]).split()
            ref_tokens = remove_punc(instance["reference"]).split()

            # Multiset intersection keeps the minimum count of each token.
            overlap = Counter(ref_tokens) & Counter(cand_tokens)
            matched = sum(overlap.values())

            # The zero branch also covers an empty candidate (no division).
            if matched == 0:
                instance["precision"] = 0
            else:
                instance["precision"] = matched / len(cand_tokens)

            instance["tp"] = matched
            instance["fp"] = len(cand_tokens) - matched

def add_rouge_order_n(mocha_dataset, order):
    """Placeholder for a ROUGE annotation of the given ``order``; not implemented.

    Raises
    ------
    NotImplementedError
        Always, whatever ``mocha_dataset`` contains.
    """
    # Raise up front: raising only inside a loop over the datasets would let
    # an empty `mocha_dataset` return silently, as if scores had been added.
    raise NotImplementedError("add_rouge_order_n is not implemented yet")

def add_first_error_position(mocha_dataset):
    """Placeholder for a first-error-position annotation; not implemented.

    Raises
    ------
    NotImplementedError
        Always, whatever ``mocha_dataset`` contains.
    """
    # Raise up front: raising only inside a loop over the datasets would let
    # an empty `mocha_dataset` return silently, as if values had been added.
    raise NotImplementedError("add_first_error_position is not implemented yet")

            
def add_word_movers_distance(mocha_dataset):
    """Placeholder for a word mover's distance annotation; not implemented.

    Reference material for a future implementation:
    https://markroxor.github.io/gensim/static/notebooks/WMD_tutorial.html
    """
    raise NotImplementedError()

def add_sari(mocha_dataset):
    """Placeholder for a SARI annotation; not implemented.

    Metric reference:
    https://github.com/huggingface/datasets/tree/master/metrics/sari

    Raises
    ------
    NotImplementedError
        Always, whatever ``mocha_dataset`` contains.
    """
    # Raise up front: raising only inside a loop over the datasets would let
    # an empty `mocha_dataset` return silently, as if scores had been added.
    raise NotImplementedError("add_sari is not implemented yet")

In [54]:
# NOTE(review): leftover scratch cell. `Counter` is not used at module level
# (add_recall and add_precision import it locally) and the length check below
# does not feed into any other cell shown here.
from collections import Counter
len("hello it's me")

13

In [57]:
# Metrics currently switched off (kept for reference):
# add_bleu_predictions(data)
# add_meteor_predictions(data)
# add_rouge_predictions(data)
# add_bertscore_predictions(data)
# add_bleurt_predictions(data)

# Metrics currently switched on; each one annotates `data` in place.
for add_metric in (add_word_edit_rate, add_edit_score, add_recall):
    add_metric(data)

In [59]:
# Display one "drop" example, looked up by its id, to eyeball the added fields.
# NOTE(review): the saved output lists a 'char_edit_score' key that no function
# shown in this notebook writes -- presumably left over from an earlier
# kernel state; re-run from a fresh kernel to confirm.
data["drop"]["01d2dcd528219ac0739e8e07030ae88b"]

{'candidate': 'between the ages of 10 to 29',
 'context': 'The age distribution, , in Lausanne is; 11,818 children or 9.4% of the population are between 0 and 9 years old and 12,128 teenagers or 9.7% are between 10 and 19. Of the adult population, 21,101 people or 16.8% of the population are between 20 and 29 years old. 22,158 people or 17.6% are between 30 and 39, 18,016 people or 14.4% are between 40 and 49, and 13,940 people or 11.1% are between 50 and 59. The senior population distribution is 11,041 people or 8.8% of the population are between 60 and 69 years old, 8,277 people or 6.6% are between 70 and 79, there are 5,896 people or 4.7% who are between 80 and 89, and there are 1,171 people or 0.9% who are 90 and older.',
 'metadata': {'scores': [5, 5], 'source': 'naqanet'},
 'question': 'Are more people between the ages of 10 to 29 or 80 and older?',
 'reference': '10 to 29',
 'score': 5,
 'bleurt': -0.44349393248558044,
 'char_edit_score': 2.5,
 'edit_ratio': 1.333333333333333,
 

In [17]:
!pip install jiwer


Collecting jiwer
  Using cached jiwer-2.3.0-py3-none-any.whl (15 kB)
Collecting python-Levenshtein==0.12.2
  Using cached python_Levenshtein-0.12.2-cp39-cp39-linux_x86_64.whl
Installing collected packages: python-Levenshtein, jiwer
Successfully installed jiwer-2.3.0 python-Levenshtein-0.12.2
