# Requirements

In [None]:
!pip install --quiet bert_score datasets nltk pandas sumeval

In [None]:
import json
import re
from os import makedirs
from pathlib import Path, PurePath
from statistics import mean

import numpy as np
import pandas as pd

import datasets
from datasets import load_metric, Metric
from datasets.config import importlib_metadata, version

from bert_score import utils
from sumeval.metrics.rouge import RougeCalculator
from nltk.translate import meteor_score

In [None]:
# Base directory under which the 'testing-files' folders used by this notebook are resolved.
root = '..'

# METEOR Definitions

In [None]:
# Version of the installed NLTK package; the METEOR code below branches on it.
NLTK_VERSION = version.parse(importlib_metadata.version("nltk"))
# word_tokenize is imported only for NLTK 3.6.4 or newer
# (Meteor._compute calls it from 3.6.5 onwards).
if NLTK_VERSION >= version.Version("3.6.4"):
    from nltk import word_tokenize


# BibTeX entry passed as `citation` to the MetricInfo of the Meteor class.
_CITATION = """\
@inproceedings{banarjee2005,
  title     = {{METEOR}: An Automatic Metric for {MT} Evaluation with Improved Correlation with Human Judgments},
  author    = {Banerjee, Satanjeev  and Lavie, Alon},
  booktitle = {Proceedings of the {ACL} Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and/or Summarization},
  month     = jun,
  year      = {2005},
  address   = {Ann Arbor, Michigan},
  publisher = {Association for Computational Linguistics},
  url       = {https://www.aclweb.org/anthology/W05-0909},
  pages     = {65--72},
}
"""

# Free-text description of the metric; passed as `description` to MetricInfo
# and to the add_start_docstrings decorator on the Meteor class.
_DESCRIPTION = """\
METEOR, an automatic metric for machine translation evaluation
that is based on a generalized concept of unigram matching between the
machine-produced translation and human-produced reference translations.
Unigrams can be matched based on their surface forms, stemmed forms,
and meanings; furthermore, METEOR can be easily extended to include more
advanced matching strategies. Once all generalized unigram matches
between the two strings have been found, METEOR computes a score for
this matching using a combination of unigram-precision, unigram-recall, and
a measure of fragmentation that is designed to directly capture how
well-ordered the matched words in the machine translation are in relation
to the reference.
METEOR gets an R correlation value of 0.347 with human evaluation on the Arabic
data and 0.331 on the Chinese data. This is shown to be an improvement on
using simply unigram-precision, unigram-recall and their harmonic F1
combination.
"""

# Usage/argument description; passed as `inputs_description` to MetricInfo
# and to the add_start_docstrings decorator on the Meteor class.
_KWARGS_DESCRIPTION = """
Computes METEOR score of translated segments against one or more references.
Args:
    predictions: list of predictions to score. Each prediction
        should be a string with tokens separated by spaces.
    references: list of reference for each prediction. Each
        reference should be a string with tokens separated by spaces.
    alpha: Parameter for controlling relative weights of precision and recall. default: 0.9
    beta: Parameter for controlling shape of penalty as a function of fragmentation. default: 3
    gamma: Relative weight assigned to fragmentation penalty. default: 0.5
Returns:
    'meteor': meteor score.
Examples:
    >>> meteor = datasets.load_metric('meteor')
    >>> predictions = ["It is a guide to action which ensures that the military always obeys the commands of the party"]
    >>> references = ["It is a guide to action that ensures that the military will forever heed Party commands"]
    >>> results = meteor.compute(predictions=predictions, references=references)
    >>> print(round(results["meteor"], 4))
    0.6944
"""


@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Meteor(datasets.Metric):
    # METEOR metric backed by nltk.translate.meteor_score. On NLTK >= 3.6.5
    # both texts are tokenized with word_tokenize(language='turkish') before
    # scoring; on older versions the raw strings are scored directly.
    # Kept as a comment rather than a docstring so the class __doc__ seen by
    # the decorator above is unchanged.

    def _info(self):
        """Return the MetricInfo describing this metric and its two string inputs."""
        input_features = datasets.Features(
            {
                "predictions": datasets.Value("string", id="sequence"),
                "references": datasets.Value("string", id="sequence"),
            }
        )
        documentation_urls = [
            "https://www.nltk.org/api/nltk.translate.html#module-nltk.translate.meteor_score",
            "https://en.wikipedia.org/wiki/METEOR",
        ]
        return datasets.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=input_features,
            codebase_urls=["https://github.com/nltk/nltk/blob/develop/nltk/translate/meteor_score.py"],
            reference_urls=documentation_urls,
        )

    def _download_and_prepare(self, dl_manager):
        """Download the NLTK resources needed for the installed NLTK version."""
        import nltk

        # WordNet is fetched unconditionally.
        nltk.download("wordnet")

        # Remaining resources are only fetched from a given NLTK version on.
        versioned_resources = (
            ("3.6.5", "punkt"),
            ("3.6.6", "omw-1.4"),
        )
        for minimum_version, resource in versioned_resources:
            if NLTK_VERSION >= version.Version(minimum_version):
                nltk.download(resource)

    def _compute(self, predictions, references, alpha=0.9, beta=3, gamma=0.5):
        """Return {"meteor": mean of the per-pair single_meteor_score values}.

        Args:
            predictions: iterable of predicted strings.
            references: iterable of reference strings, paired with predictions.
            alpha, beta, gamma: forwarded unchanged to single_meteor_score.
        """
        scores = []
        if NLTK_VERSION >= version.Version("3.6.5"):
            # From this version on, token lists are passed instead of raw strings.
            for reference, prediction in zip(references, predictions):
                reference_tokens = word_tokenize(reference, language='turkish')
                prediction_tokens = word_tokenize(prediction, language='turkish')
                scores.append(
                    meteor_score.single_meteor_score(
                        reference_tokens, prediction_tokens, alpha=alpha, beta=beta, gamma=gamma
                    )
                )
        else:
            for reference, prediction in zip(references, predictions):
                scores.append(
                    meteor_score.single_meteor_score(reference, prediction, alpha=alpha, beta=beta, gamma=gamma)
                )

        return {"meteor": np.mean(scores)}

# Function Definitions

In [None]:
def my_lower(text):
    """Lowercase *text* without leaving a stray combining dot after 'i'.

    Python's ``str.lower()`` maps the Turkish capital dotted I ('İ', U+0130)
    to 'i' followed by COMBINING DOT ABOVE (U+0307). That artefact only
    appears *after* lowercasing, so the text is lowercased first and the
    'i' + U+0307 sequence is collapsed to a plain 'i' afterwards. Any
    'i' + U+0307 sequence already present in the input is collapsed too.

    Args:
        text: the string to normalise.

    Returns:
        The lowercased string with every 'i' + U+0307 replaced by 'i'.
    """
    return text.lower().replace('i\u0307', 'i')

In [None]:
def get_bleu_inputs(predictions, references):
    """Lowercase and whitespace-tokenize predictions and references for BLEU.

    Args:
        predictions: iterable of predicted strings.
        references: iterable of reference strings.

    Returns:
        A tuple ``(bleu_predictions, bleu_references)``: each prediction
        becomes a list of tokens, and each reference becomes a one-element
        list holding its token list.
    """
    tokenized_references = []
    for reference in references:
        # Each reference is wrapped in its own single-item list.
        tokenized_references.append([my_lower(reference).split()])

    tokenized_predictions = []
    for prediction in predictions:
        tokenized_predictions.append(my_lower(prediction).split())

    return tokenized_predictions, tokenized_references

In [None]:
def calc_rouge(predictions, references):
    """Average ROUGE-1..4 and ROUGE-L over prediction/reference pairs.

    Args:
        predictions: sequence of predicted strings (must support ``len``).
        references: sequence of reference strings, paired with predictions.

    Returns:
        A dict with the keys 'rouge-1', 'rouge-2', 'rouge-3', 'rouge-4',
        'rouge-l' and 'rouge-be', each holding the per-pair score summed and
        divided by ``len(predictions)``. 'rouge-be' is never computed here
        and therefore always stays 0; the key is kept so the shape of the
        saved results does not change. For empty input every value is 0.0.
    """
    metric_names = ('rouge-1', 'rouge-2', 'rouge-3', 'rouge-4', 'rouge-l', 'rouge-be')
    totals = dict.fromkeys(metric_names, 0.0)

    # len() rather than truthiness: callers pass pandas Series, whose truth
    # value is ambiguous. Returning early avoids a division by zero.
    sample_count = len(predictions)
    if sample_count == 0:
        return totals

    rouge = RougeCalculator(lang='tr', stemming=True)

    for prediction, reference in zip(predictions, references):
        for n in range(1, 5):
            totals[f'rouge-{n}'] += rouge.rouge_n(summary=prediction, references=reference, n=n)
        totals['rouge-l'] += rouge.rouge_l(summary=prediction, references=reference)

    return {name: total / sample_count for name, total in totals.items()}

In [None]:
def save_results(results, model_alias = 'mt5-base', dataset = 'ost', run = 1, cross_testing = False, foldername = 'results'):
    """Write *results* as JSON under ``<root>/testing-files``.

    Args:
        results: JSON-serialisable object to store.
        model_alias, dataset, run: used to build the file name
            ``results-<model_alias>-<dataset>-<run>.json``.
        cross_testing: when true, the target folder name is prefixed
            with ``cross-``.
        foldername: name of the folder below ``testing-files``.
    """
    subfolder = f'cross-{foldername}' if cross_testing else foldername
    results_folder = PurePath(root, 'testing-files', subfolder)

    # Create the target folder (and any missing parents) if needed.
    makedirs(results_folder, exist_ok = True)

    target_file = PurePath(results_folder, f'results-{model_alias}-{dataset}-{run}.json')
    with open(target_file, 'w', encoding = 'utf-8') as f:
        json.dump(results, f, ensure_ascii = False, indent = 4)

In [None]:
def calc_bleu(test, prediction_alias = 'prediction', tgt_alias = 'tgt'):
    """Compute BLEU for the prediction and target columns of *test*.

    Args:
        test: mapping-like object indexed by column name.
        prediction_alias: key of the predictions column.
        tgt_alias: key of the references column.

    Returns:
        Whatever ``bleu.compute`` returns for the prepared inputs.
    """
    bleu_inputs = get_bleu_inputs(test[prediction_alias], test[tgt_alias])
    tokenized_predictions, tokenized_references = bleu_inputs
    return bleu.compute(
        predictions = tokenized_predictions,
        references = tokenized_references,
    )

In [None]:
def calc_ter(predictions, references):
    """Compute TER, giving each prediction exactly one reference.

    Args:
        predictions: iterable of predicted strings.
        references: iterable of reference strings, one per prediction.

    Returns:
        Whatever ``ter.compute`` returns.
    """
    wrapped_references = []
    for single_reference in references:
        # Each reference is passed as a one-element list.
        wrapped_references.append([single_reference])
    return ter.compute(predictions = predictions, references = wrapped_references)

In [None]:
def calc_bertscore(test, lang = 'tr', prediction_alias = 'prediction', tgt_alias = 'tgt'):
    """Compute BERTScore and reduce its per-sample scores to their means.

    Args:
        test: mapping-like object indexed by column name.
        lang: language key forwarded to ``bertscore.compute``.
        prediction_alias: key of the predictions column.
        tgt_alias: key of the references column.

    Returns:
        The dict returned by ``bertscore.compute`` with 'precision', 'f1'
        and 'recall' replaced by their means; other entries are untouched.
    """
    scores = bertscore.compute(
        predictions = test[prediction_alias],
        references = test[tgt_alias],
        lang = lang,
    )

    for score_name in ('precision', 'f1', 'recall'):
        scores[score_name] = mean(scores[score_name])

    return scores

In [None]:
# Register an extra 'tr-uncased' language key so the uncased Turkish model
# can be selected for the BERTScore evaluation.
utils.lang2model['tr-uncased'] = 'dbmdz/bert-base-turkish-uncased'

# Value stored for that model in bert_score's model2layers table.
utils.model2layers['dbmdz/bert-base-turkish-uncased'] = 10

In [None]:
# Metric objects used by the calc_* helpers and the scoring loop.
bleu = load_metric('bleu')
bertscore = load_metric('bertscore')
# METEOR uses the Meteor class defined in this notebook instead of load_metric.
meteor = Meteor()
meteor.download_and_prepare()
ter = load_metric('ter')

# Score Calculation

In [None]:
models = ['mt5-base', 'trbart']
datasets = ['ost', 'tat']
runs = [1, 2, 3, 4]

# When cross testing is enabled the models are evaluated on the
# test splits of the datasets they were not trained on
cross_testing = False

if cross_testing:
  csv_folder = PurePath(root, 'testing-files', 'csv-cross')
else:
  csv_folder = PurePath(root, 'testing-files', 'csv')

In [None]:
for model_alias in models:
    for dataset in datasets:
      data = []
      for run in runs:
          test_path = PurePath(csv_folder, f'paraphrases-{model_alias}-{dataset}-{run}.csv')
          test = pd.read_csv(test_path)
          results = dict()
          results['bleu'] = calc_bleu(test)
          results['rouge'] = calc_rouge(test['prediction'], test['tgt'])
          results['meteor'] = meteor.compute(predictions = test['prediction'], references = test['tgt'])
          results['ter'] = calc_ter(test['prediction'], test['tgt'])
          results['bertscore-uncased'] = calc_bertscore(test, lang = 'tr-uncased')
          results['bertscore-cased'] = calc_bertscore(test, lang = 'tr')
          save_results(results, model_alias, dataset, run, cross_testing = cross_testing)
          data.append(results)
      display(pd.DataFrame(data))