## Imports

We start with required imports.

In [1]:
import os
import json # Just for pretty printing the resulting dict.

from jury import Jury
from jury.metrics import load_metric

In [2]:
from typing import List


def read_from_txt(path: str) -> List[str]:
    """
    Read a text file and return its lines, one list item per line.

    The line terminator is stripped from every line; otherwise a trailing
    newline would become part of each returned prediction/reference string.
    Blank lines are kept (as empty strings) so that line positions stay aligned
    between a predictions file and a references file.

    Args:
        path: Path of the text file to read.

    Returns:
        The lines of the file, in file order, without their trailing newline.
    """
    # Explicit encoding: the platform default is not UTF-8 on every system.
    with open(path, "r", encoding="utf-8") as f:
        return [line.rstrip("\n") for line in f]

## Task 1: Machine Translation

We evaluate sample machine translation outputs against their references. Feel free to play around with the samples below. Alternatively, you can load your own predictions and references with the helper function `read_from_txt()`: each line is treated as a separate prediction or reference, so the line order must be consistent between the prediction and reference txt files.

In [3]:
# Sample MT outputs: one inner list of candidate strings per evaluated item.
mt_predictions = [
    [
        "the cat is on the mat",
        "There is cat playing on the mat",
    ],
    [
        "Look! a wonderful day.",
    ],
]
# Reference translations, listed in the same item order as the predictions.
mt_references = [
    [
        "the cat is playing on the mat.",
        "The cat plays on the mat.",
    ],
    [
        "Today is a wonderful day",
        "The weather outside is wonderful.",
    ],
]

# To evaluate your own files instead, load them with the helper:
# mt_predictions = read_from_txt("/path/to/predictions.txt")
# mt_references = read_from_txt("/path/to/references.txt")

### Define Metrics

Here, define the metrics used to evaluate the MT predictions against the references. You can either use the `load_metric` function from jury, which lets you pass additional parameters to the specified metric, or specify a metric as a string, which will use its default parameters.

**NOTE:** Computing BERTScore may take some time, as it downloads a model for computing embeddings. We therefore use the smaller `albert-base-v1` here; to use the default model `roberta-large` instead, uncomment the commented-out `load_metric("bertscore")` line in the cell below and use it in place of the `albert-base-v1` line.

[Here](https://huggingface.co/transformers/pretrained_models.html), you can observe model sizes, parameter counts, etc.

In [4]:
# Metrics for the MT task, each created through jury's `load_metric` so that a
# custom resulting name and/or non-default parameters can be supplied.
MT_METRICS = [
    # BLEU is loaded twice with different `max_order` values; the distinct
    # resulting names keep the two scores apart in the output dict.
    load_metric(metric_name="bleu", resulting_name="bleu_1", params={"max_order": 1}),
    load_metric(metric_name="bleu", resulting_name="bleu_2", params={"max_order": 2}),
    load_metric("meteor", "meteor"),
    load_metric("rouge"),
    load_metric("sacrebleu"),
#     load_metric("bertscore"), # Default-model variant; swap with the line below to use it.
    load_metric("bertscore", params={"model_type": "albert-base-v1"})  # Using smaller model to reduce download time.
]

# Alternatively, name the metrics as plain strings to get their default parameters:
# MT_METRICS = [
#     "bleu",
#     "meteor",
#     "rouge"
# ]

# Passed to Jury(run_concurrent=...) and also used to set TOKENIZERS_PARALLELISM
# in the scoring cell.
RUN_CONCURRENT = True  # set False to disable concurrency.

In [5]:
# Compute scores

# Keep the tokenizers parallelism flag in step with the concurrency switch.
os.environ["TOKENIZERS_PARALLELISM"] = "true" if RUN_CONCURRENT else "false"

# Build the evaluator from the configured metrics and score the MT samples.
mt_jury = Jury(metrics=MT_METRICS, run_concurrent=RUN_CONCURRENT)
scores = mt_jury.evaluate(predictions=mt_predictions, references=mt_references)

100%|██████████████████████████████████████████| 2/2 [00:00<00:00, 49932.19it/s]
  0%|                                                     | 0/2 [00:00<?, ?it/s][nltk_data] Downloading package wordnet to
[nltk_data]     /home/devrimcavusoglu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
100%|█████████████████████████████████████████████| 2/2 [00:03<00:00,  1.89s/it]
100%|█████████████████████████████████████████████| 2/2 [00:04<00:00,  2.33s/it]
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/devrimcavusoglu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
100%|█████████████████████████████████████████████| 2/2 [00:05<00:00,  2.65s/it]
100%|█████████████████████████████████████████████| 2/2 [00:05<00:00,  2.67s/it]
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/devrimcavusoglu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
100%|█████████████████████████████████████████████| 2/2 [00:05<00:00,  2.9

In [6]:
# Display results: serialize the score dict as 4-space-indented JSON for readability.
print(json.dumps(scores, indent=4))

{
    "empty_predictions": 0,
    "total_items": 2,
    "bleu_1": 0.7920502936517768,
    "bleu_2": 0.7225612529515497,
    "meteor": 0.5420511682934044,
    "rougeL": 0.7948717948717947,
    "SacreBLEU": 0.3898310279399514,
    "BERTScore": 0.7431023120880127
}


## Task 2: Question Answering

For the question answering task, the commonly used evaluation metrics are exact match and F1 score; the datasets package provides both through a metric named "squad". The same interface is available here as well, with a single exception: in order to seamlessly compute, concatenate and output the resulting scores, Jury restricts each metric to computing a single score. By default, the SQUAD implementation computes (SQuAD's) F1 score.

In [7]:
# Sample QA outputs: one answer string per item.
qa_predictions = [
    "1917",
    "Albert Einstein",
    "foo bar",
]
# Reference answers, in the same order as the predictions.
qa_references = [
    "1917",
    "Einstein",
    "foo bar foobar",
]

# A metric given by name is used with its default parameters.
QA_METRICS = ["squad"]

In [8]:
# Score the QA samples without concurrency and show the result as indented JSON.
# Note: this rebinds `scores`, which previously held the MT results.
qa_jury = Jury(metrics=QA_METRICS, run_concurrent=False)
scores = qa_jury.evaluate(predictions=qa_predictions, references=qa_references)
print(json.dumps(scores, indent=4))

100%|██████████████████████████████████████████| 3/3 [00:00<00:00, 62291.64it/s]


{
    "empty_predictions": 0,
    "total_items": 3,
    "SQUAD": 0.8222222222222223
}


## Defining a custom metric

To define a custom metric, you only need to extend the `jury.metrics.Metric` class and implement the required functions as desired. Here we create a metric that computes precision for our QA task above.

In [40]:
from collections import Counter
from typing import Dict

from jury.metrics import Metric


class Precision(Metric):
    """
    Compute simple token-level precision as
        Average( # of matching tokens / # of tokens in prediction )

    Predictions and references are split on whitespace. A token counts as
    matching at most as many times as it occurs in the reference. A prediction
    with no tokens scores 0.0, and an empty batch yields an average of 0.0, so
    neither case raises ZeroDivisionError.
    """
    def __init__(self, metric_name: str = None, resulting_name: str = None, params: Dict = None):
        # Both names default to the class name, so the score is reported as "Precision".
        metric_name = self.__class__.__name__ if metric_name is None else metric_name
        resulting_name = metric_name if resulting_name is None else resulting_name
        super().__init__(metric_name=metric_name, resulting_name=resulting_name, params=params)

    def _preprocess(self, predictions, references):
        """Split every prediction and reference string into whitespace-separated tokens."""
        predictions = [p.split() for p in predictions]
        references = [r.split() for r in references]
        return predictions, references

    def _compute(self, predictions, references):
        """
        Average the per-item precision over tokenized prediction/reference pairs.

        Returns:
            A single-entry dict mapping `self.resulting_name` to the average score.
        """
        scores = []
        for pred, ref in zip(predictions, references):
            if not pred:
                # No predicted tokens: nothing can match, and len(pred) would be 0.
                scores.append(0.0)
                continue
            # Counter intersection keeps min(pred_count, ref_count) for each shared token.
            overlap = sum((Counter(pred) & Counter(ref)).values())
            scores.append(overlap / len(pred))
        # Guard the average against an empty batch.
        avg_score = sum(scores) / len(scores) if scores else 0.0
        return {self.resulting_name: avg_score}

    def compute(self, predictions, references) -> Dict[str, float]:
        """Required to be used by jury: tokenize the inputs, then compute the score."""
        predictions, references = self._preprocess(predictions, references)
        return self._compute(predictions, references)
        

In [38]:
from jury.metrics.squad import SQUAD

# Rebinds QA_METRICS (previously a list of metric names) to metric instances, so
# the custom Precision metric is evaluated together with SQUAD.
QA_METRICS = [
    SQUAD(),
    Precision()
]

In [39]:
# Re-create the evaluator with the current QA_METRICS and score the same QA samples;
# `qa_jury` and `scores` are rebound here.
qa_jury = Jury(metrics=QA_METRICS, run_concurrent=False)
scores = qa_jury.evaluate(predictions=qa_predictions, references=qa_references)
print(json.dumps(scores, indent=4))

100%|██████████████████████████████████████████| 3/3 [00:00<00:00, 56679.78it/s]


{
    "empty_predictions": 0,
    "total_items": 3,
    "SQUAD": 0.8222222222222223,
    "Precision": 0.8333333333333334
}
