# Colab

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
%cd /content/drive/MyDrive/uni_bonn/nlp/final_project

/content/drive/MyDrive/uni_bonn/nlp/final_project


# Install modules

In [None]:
!pip install datasets
!pip install 'transformers[torch]'
!pip install torchmetrics
!pip install nltk sacrebleu

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, datasets
Successfully installed datasets-2.16.1 dill-0.3.7 multiprocess-0.70.15
Collecting accelerate>=0.20.3 (from transformers[torch])
  Downloading accelerate-0.26.1-py3-none-any.whl (270 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m8.4 MB/s[0m eta [36m0

# Import modules

In [None]:
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import numpy as np
import json
from transformers import EncoderDecoderModel
import torchmetrics
from datasets import Dataset
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
import gc
import torch

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [None]:
import math
import sacrebleu
import nltk
from nltk.translate.bleu_score import sentence_bleu
from nltk import ngrams
from nltk.stem import PorterStemmer
from nltk.corpus import wordnet
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

# Set model

In [None]:
model_name = "grammarly/coedit-large"

# Load dataset using Transformer datasets

In [None]:
# Split name -> path of the preprocessed GYAFC JSON file for that split.
# Paths are relative to the current working directory.
data_files={
    "train":"./dataset/GYAFC_Corpus/data_train_preprocessed_coedit.json",
    "val":"./dataset/GYAFC_Corpus/data_val_preprocessed_coedit.json",
    "test":"./dataset/GYAFC_Corpus/data_test_preprocessed_coedit.json",
}

In [None]:
dataset = load_dataset("json", data_files=data_files)

Generating train split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.50k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [None]:
# Token budgets used by preprocess_function: source and target sequences are
# padded or truncated to exactly this many tokens.
encoder_max_length=24
decoder_max_length=24

def preprocess_function(examples, input_field="informal", target_field="formal.ref0"):
  """Tokenize one batch of source/target pairs for a seq2seq model.

  Parameters:
  - examples: batch dict whose "transformation" entry is a list of dicts
    holding the source text under `input_field` and the target text under
    `target_field`.
  - input_field (str): key of the source sentence inside each dict.
  - target_field (str): key of the target sentence inside each dict.

  Returns:
  - The same `examples` dict with "input_ids", "attention_mask",
    "decoder_input_ids", "decoder_attention_mask" and "labels" added.
    Sources are padded/truncated to `encoder_max_length`, targets to
    `decoder_max_length`.
  """
  sources = [ex[input_field] for ex in examples["transformation"]]
  targets = [ex[target_field] for ex in examples["transformation"]]

  # The targets are tokenized exactly once, below. Passing them to the source
  # call as `text_target` only produced a `labels` entry that was never read.
  source_encoding = tokenizer(sources, padding="max_length", truncation=True, max_length=encoder_max_length)
  target_encoding = tokenizer(targets, padding="max_length", truncation=True, max_length=decoder_max_length)

  examples["input_ids"] = source_encoding.input_ids
  examples["attention_mask"] = source_encoding.attention_mask
  # NOTE(review): decoder_input_ids are the unshifted target ids. That is fine
  # for generation (only input_ids are consumed there), but confirm before
  # feeding this column to a training loop.
  examples["decoder_input_ids"] = target_encoding.input_ids
  examples["decoder_attention_mask"] = target_encoding.attention_mask

  # Replace padding ids with -100 so padded positions are ignored by the loss.
  examples["labels"] = [
      [-100 if token == tokenizer.pad_token_id else token for token in ids]
      for ids in target_encoding.input_ids
  ]

  return examples

In [None]:
def create_multi_ref_dataset(dataset, target_fields=('formal.ref0', 'formal.ref1', 'formal.ref2', 'formal.ref3')):
  """Attach tokenized inputs and one label column per reference to `dataset`.

  The dataset is mapped through `preprocess_function` once per entry of
  `target_fields`. The first field contributes the model input columns plus
  `labels`; every later field i contributes only `labels_{i}`.

  Parameters:
  - dataset: object exposing `map`, `add_column` and column access by name.
  - target_fields: ordered reference keys inside each "transformation" dict.
    Defaults to the four references used in this notebook.

  Returns:
  - The dataset with the added columns (unchanged if `target_fields` is empty).
  """
  primary_columns = ('input_ids', 'attention_mask', 'decoder_input_ids', 'decoder_attention_mask', 'labels')

  for i, target_field in enumerate(target_fields):
    tokenized = dataset.map(
      preprocess_function,
      batched=True,
      fn_kwargs=dict(target_field=target_field),
    )

    if i == 0:
      # The source-side columns are identical for every reference, so they
      # are copied only once, from the first pass.
      for column in primary_columns:
        dataset = dataset.add_column(column, tokenized[column])
    else:
      dataset = dataset.add_column(f'labels_{i}', tokenized['labels'])

  return dataset

In [None]:
val_dataset = create_multi_ref_dataset(dataset['val'])
test_dataset = create_multi_ref_dataset(dataset['test'])

Map:   0%|          | 0/5665 [00:00<?, ? examples/s]

Map:   0%|          | 0/5665 [00:00<?, ? examples/s]

Map:   0%|          | 0/5665 [00:00<?, ? examples/s]

Map:   0%|          | 0/5665 [00:00<?, ? examples/s]

Map:   0%|          | 0/2748 [00:00<?, ? examples/s]

Map:   0%|          | 0/2748 [00:00<?, ? examples/s]

Map:   0%|          | 0/2748 [00:00<?, ? examples/s]

Map:   0%|          | 0/2748 [00:00<?, ? examples/s]

In [None]:
train_dataset = dataset['train'].map(
    preprocess_function,
    batched=True,
)

Map:   0%|          | 0/104562 [00:00<?, ? examples/s]

In [None]:
print(val_dataset)
print(train_dataset)

Dataset({
    features: ['id', 'topic', 'transformation', 'input_ids', 'attention_mask', 'decoder_input_ids', 'decoder_attention_mask', 'labels', 'labels_1', 'labels_2', 'labels_3'],
    num_rows: 5665
})
Dataset({
    features: ['id', 'topic', 'transformation', 'input_ids', 'attention_mask', 'decoder_input_ids', 'decoder_attention_mask', 'labels'],
    num_rows: 104562
})


In [None]:
print(val_dataset[0])

{'id': 0, 'topic': 'Family_Relationships', 'transformation': {'formal.ref0': 'If you are under 18, you have a big problem.', 'formal.ref1': 'You have a big problem if you are under eighteen.', 'formal.ref2': 'In the event your age is below 18 years, you are saddled with an enormous difficulty.', 'formal.ref3': 'If you are under the age of 18 you have a big problem.', 'informal': 'Write this more formally: If you are under 18 you have a Big Problem.'}, 'input_ids': [8733, 48, 72, 3, 19448, 10, 156, 25, 33, 365, 507, 25, 43, 3, 9, 2734, 5289, 5, 1, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0], 'decoder_input_ids': [156, 25, 33, 365, 14985, 25, 43, 3, 9, 600, 682, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'decoder_attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [156, 25, 33, 365, 14985, 25, 43, 3, 9, 600, 682, 5, 1, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -1

In [None]:
print(len(val_dataset))

5665


In [None]:
tokenizer.decode(train_dataset[1708]["input_ids"], skip_special_tokens=True)

'Write this more formally: Besides how old were you when you lost virginity and the tables turn on you'

# Metrics implementations

## Formality

In [None]:
import pandas as pd
from nltk import ngrams
from collections import defaultdict

In [None]:
def read_data(base_dir='./Metrics/Formality/annotated_ngrams'):
    """Load the annotated n-gram tables for n = 1..5.

    Parameters:
    - base_dir (str): directory containing the tab-separated files
      `1-gram.txt` ... `5-gram.txt`. Defaults to the location used by this
      notebook, relative to the current working directory.

    Returns:
    - list of pandas.DataFrame: one frame per n, ordered so that index n-1
      holds the n-gram table.
    """
    return [
        pd.read_csv(f'{base_dir}/{n}-gram.txt', sep='\t')
        for n in range(1, 6)
    ]

In [None]:
def evaluate_sentence(sentence, ngram_data_list):
    """Score a sentence against the annotated n-gram tables for n = 1..5.

    Parameters:
    - sentence (str): whitespace-tokenized text to score.
    - ngram_data_list (list): tables where index n-1 holds the n-gram table
      with an 'n-gram' column (space-joined tokens) and an 'average_score'
      column.

    Returns:
    - list: five values, one per n. Each is the mean 'average_score' of the
      sentence's n-grams found in the table, or 0 when none are found.
    """
    tokens = sentence.split()
    average_scores = []
    for n in range(1, 6):
        ngram_keys = [' '.join(ngram) for ngram in ngrams(tokens, n)]

        total_score = 0
        count = 0
        if ngram_keys:
            table = ngram_data_list[n - 1]
            keys = table['n-gram'].values
            scores = table['average_score'].values
            # Build the lookup once per n instead of scanning the whole table
            # twice for every n-gram. Inserting in reverse order makes the
            # first occurrence of a duplicated n-gram win, which matches
            # taking `.values[0]` of the filtered rows.
            score_by_ngram = dict(zip(keys[::-1], scores[::-1]))

            for ngram_key in ngram_keys:
                if ngram_key in score_by_ngram:
                    total_score += score_by_ngram[ngram_key]
                    count += 1

        # Average over matched n-grams only; unmatched n-grams are ignored.
        average_score = total_score / count if count > 0 else 0
        average_scores.append(average_score)
    return average_scores

In [None]:
# Example usage
sentence = "Mr. Ramesh said 70 percent of India's iron ore lay in states infiltrated by Maoists; production in this area is stalled at 16 million tons a year even though the area has the potential to produce 100 million tons."
formality_evaluation_data = read_data()
result = evaluate_sentence(sentence, formality_evaluation_data)
print(f"Sentence: `{sentence}`\nScore: {result}")

Sentence: `Mr. Ramesh said 70 percent of India's iron ore lay in states infiltrated by Maoists; production in this area is stalled at 16 million tons a year even though the area has the potential to produce 100 million tons.`
Score: [0.8237399883653753, 1.504411302878411, 1.7238738738738726, 1.7999999999999987, 1.7999999999999987]


## Translation Edit Rate Plus (TERp): TERp measures the mismatch between a machine-generated translation and a human-written reference translation. The calculation can be represented as:

In [None]:
#Higher is better
def calculate_terp(hypothesis: str, reference: str, phrase_table: dict = None, edit_costs: dict = None) -> float:
    """
    Calculate a TERp-style match score between a hypothesis and a reference.

    The score is (stem matches + synonym matches + phrase substitution cost)
    divided by twice the number of hypothesis tokens. Tokens are obtained by
    whitespace splitting.

    Parameters:
    - hypothesis (str): The generated hypothesis or translation.
    - reference (str): The reference or ground truth translation.
    - phrase_table (dict, optional): A dictionary representing a phrase table for paraphrase information.
    - edit_costs (dict, optional): A dictionary containing weights for edit operations in the TERp calculation.
      Phrase substitutions are only considered when both `phrase_table` and
      `edit_costs` are given.

    Returns:
    - float: The score, where higher values are better. An empty (or
             whitespace-only) hypothesis scores 0.0.
    """
    hypothesis_tokens = hypothesis.split()
    reference_tokens = reference.split()

    # Nothing can match an empty hypothesis; returning early also avoids
    # dividing by zero below.
    if not hypothesis_tokens:
        return 0.0

    # TERp by Stem Matches, Synonym Matches, and Phrase Substitutions
    stem_matches = calculate_stem_matches(hypothesis_tokens, reference_tokens)
    synonym_matches = calculate_synonym_matches(hypothesis_tokens, reference_tokens)

    if phrase_table is not None and edit_costs is not None:
        phrase_substitutions = calculate_phrase_substitutions(hypothesis_tokens, reference_tokens, phrase_table, edit_costs)
    else:
        phrase_substitutions = 0

    # Calculate TERp score
    terp_score = (stem_matches + synonym_matches + phrase_substitutions) / (2*len(hypothesis_tokens))

    return terp_score

def calculate_stem_matches(hypothesis_tokens: list, reference_tokens: list) -> int:
    """
    Count positions at which hypothesis and reference tokens share a stem.

    Tokens are paired by position (extra tokens of the longer sequence are
    ignored), lower-cased and reduced with a Porter stemmer before comparison.

    Parameters:
    - hypothesis_tokens (list): List of tokens in the hypothesis.
    - reference_tokens (list): List of tokens in the reference.

    Returns:
    - int: The number of stem matches.
    """
    stem = PorterStemmer().stem
    matches = 0
    for hyp_token, ref_token in zip(hypothesis_tokens, reference_tokens):
        if stem(hyp_token.lower()) == stem(ref_token.lower()):
            matches += 1
    return matches

def calculate_synonym_matches(hypothesis_tokens: list, reference_tokens: list) -> int:
    """
    Count positions at which hypothesis and reference tokens are synonyms.

    Tokens are paired by position (extra tokens of the longer sequence are
    ignored) and lower-cased before being passed to `are_synonyms`.

    Parameters:
    - hypothesis_tokens (list): List of tokens in the hypothesis.
    - reference_tokens (list): List of tokens in the reference.

    Returns:
    - int: The number of synonym matches.
    """
    matches = 0
    for hyp_token, ref_token in zip(hypothesis_tokens, reference_tokens):
        if are_synonyms(hyp_token.lower(), ref_token.lower()):
            matches += 1
    return matches

def are_synonyms(word1: str, word2: str, threshold: float = 0.7) -> bool:
    """
    Check if two words are synonyms according to WordNet.

    Two words count as synonyms when any pair of their synsets has a
    Wu-Palmer similarity strictly above `threshold`.

    Parameters:
    - word1 (str): The first word.
    - word2 (str): The second word.
    - threshold (float, optional): Similarity a synset pair must exceed.

    Returns:
    - bool: True if the words are synonyms, False otherwise (including when
            either word has no synsets).
    """
    synsets1 = wordnet.synsets(word1)
    synsets2 = wordnet.synsets(word2)

    for set1 in synsets1:
        for set2 in synsets2:
            similarity = set1.wup_similarity(set2)
            # wup_similarity may return None when no connecting path exists;
            # comparing None with a float would raise TypeError.
            if similarity is not None and similarity > threshold:
                return True
    return False

def calculate_phrase_substitutions(hypothesis_tokens: list, reference_tokens: list, phrase_table: dict, edit_costs: dict) -> float:
    """
    Sum the substitution costs of all (hypothesis token, reference token)
    pairs that are listed in the phrase table.

    Every hypothesis token is compared with every reference token. For a pair
    found in `phrase_table`, the cost is
        w1 + w2 * edit * log(probability) + w3 * edit * probability + w4 * edit
    and negative costs are clamped to zero before being added.

    Parameters:
    - hypothesis_tokens (list): List of tokens in the hypothesis.
    - reference_tokens (list): List of tokens in the reference.
    - phrase_table (dict): Maps (hypothesis_token, reference_token) tuples to
      dicts with 'edit' and 'probability' entries.
    - edit_costs (dict): Weights under the keys 'w1', 'w2', 'w3' and 'w4'.

    Returns:
    - float: The total cost of phrase substitutions (0 if no pair is listed).
    """
    total = 0

    for hyp_token in hypothesis_tokens:
        for ref_token in reference_tokens:
            pair = (hyp_token, ref_token)
            if pair not in phrase_table:
                continue

            info = phrase_table[pair]
            raw_cost = (
                edit_costs['w1'] +
                edit_costs['w2'] * info['edit'] * math.log(info['probability']) +
                edit_costs['w3'] * info['edit'] * info['probability'] +
                edit_costs['w4'] * info['edit']
            )

            # Negative costs are clamped so they never reduce the total.
            total += max(0, raw_cost)

    return total

def terp_alignment(hypothesis: str, reference: str, phrase_table: dict = None, edit_costs: dict = None) -> list:
    """
    Build a positional word-level alignment between a hypothesis and a reference.

    Both strings are split on whitespace and paired by position; tokens beyond
    the length of the shorter sequence are dropped.

    Parameters:
    - hypothesis (str): The generated hypothesis or translation.
    - reference (str): The reference or ground truth translation.
    - phrase_table (dict, optional): Accepted but not used by this function.
    - edit_costs (dict, optional): Accepted but not used by this function.

    Returns:
    - list of tuples: One (hypothesis_token, reference_token, alignment_type)
                      tuple per position, where alignment_type is
                      "Exact Match" for identical tokens and "Mismatch" otherwise.
    """
    return [
        (hyp_token, ref_token, "Exact Match" if hyp_token == ref_token else "Mismatch")
        for hyp_token, ref_token in zip(hypothesis.split(), reference.split())
    ]

## Paraphrase In N-gram Changes (PINC): PINC is the percentage of n-grams that appear in the candidate sentence but not in the source sentence. The calculation can be represented as:

In [None]:
# Lower is better
def calculate_pinc(hypothesis: str, reference: str, n: int) -> float:
    """
    Calculate the PINC (Paraphrase In N-gram Changes) score for one n-gram order.

    The score is the fraction of distinct hypothesis n-grams that do not occur
    among the distinct n-grams of `reference`. Both strings are tokenized by
    whitespace splitting.

    Parameters:
    - hypothesis (str): The generated hypothesis or translation.
    - reference (str): The sentence the hypothesis is compared against.
    - n (int): The size of n-grams for which to calculate the PINC score.

    Returns:
    - float: The PINC score in [0, 1]; 0 when the hypothesis has no n-grams
             of size n.
    """
    candidate_ngrams = set(ngrams(hypothesis.split(), n))
    compared_ngrams = set(ngrams(reference.split(), n))

    if len(candidate_ngrams) == 0:
        return 0

    novel_ngrams = candidate_ngrams.difference(compared_ngrams)
    return len(novel_ngrams) / len(candidate_ngrams)

## Test

In [None]:
# Example usage:
hypothesis_sentence = "This is an example sentence."
reference_sentence = "This is an example sentence."

# Calculate TERp score
terp_score = calculate_terp(hypothesis_sentence, reference_sentence)
print(f"TERp Score: {terp_score}")

# Generate alignment
alignment = terp_alignment(hypothesis_sentence, reference_sentence)
print("Alignment:", alignment)

TERp Score: 0.8
Alignment: [('This', 'This', 'Exact Match'), ('is', 'is', 'Exact Match'), ('an', 'an', 'Exact Match'), ('example', 'example', 'Exact Match'), ('sentence.', 'sentence.', 'Exact Match')]


In [None]:
pinc_score = calculate_pinc(hypothesis_sentence, reference_sentence, 2)
print(f"PINC Score: {pinc_score}")

PINC Score: 0.0


## Aggregate functions

In [None]:
def terp_metric(preds, refs):
  """Corpus-level TERp score.

  For each prediction, `calculate_terp` is evaluated against every reference
  in `refs[i]` and the minimum is kept; the result is the mean of these
  per-prediction minima.

  NOTE(review): `calculate_terp` is labelled "higher is better", yet the
  minimum over references is taken here - confirm this is intended.
  """
  per_prediction = []
  for idx, pred in enumerate(preds):
    candidate_scores = [calculate_terp(pred, reference) for reference in refs[idx]]
    per_prediction.append(np.min(np.array(candidate_scores)))

  return np.mean(np.asarray(per_prediction, dtype=float))

In [None]:
def pinc_metric(preds, refs, n=2):
  """Corpus-level PINC score for n-grams of size `n`.

  For each prediction, `calculate_pinc` is evaluated against every reference
  in `refs[i]` and the minimum is kept; the result is the mean of these
  per-prediction minima.
  """
  per_prediction = []
  for idx, pred in enumerate(preds):
    candidate_scores = [calculate_pinc(pred, reference, n) for reference in refs[idx]]
    per_prediction.append(np.min(np.array(candidate_scores)))

  return np.mean(np.asarray(per_prediction, dtype=float))

In [None]:
def formality_metric(preds):
  """Mean formality score over all predictions and all n-gram orders.

  Loads the n-gram tables with `read_data`, scores every prediction with
  `evaluate_sentence`, and averages all resulting values into one number.
  """
  ngram_tables = read_data()
  per_sentence_scores = [evaluate_sentence(pred, ngram_tables) for pred in preds]
  return np.mean(np.array(per_sentence_scores))

# Compute total metric

In [None]:
def compute_metrics(pred_str, label_str):
  """Compute every evaluation metric for a list of predictions.

  Parameters:
  - pred_str: iterable of predicted sentences.
  - label_str: iterable with, per prediction, an iterable of reference sentences.

  Returns:
  - dict with the keys bleu1..bleu4, ter, sacrebleu (torchmetrics results)
    and terp, pinc (4-grams), formality (values from this notebook's own
    metric functions).
  """
  def tokenize(line):
    # Re-join spaCy tokens with single spaces so that all metrics see the
    # same tokenization.
    return " ".join(token.text for token in nlp(line))

  preds = [tokenize(line) for line in pred_str]
  labels = [[tokenize(line) for line in references] for references in label_str]

  # Calculate metrics
  text_metrics = dict(
      bleu1=torchmetrics.text.BLEUScore(n_gram=1),
      bleu2=torchmetrics.text.BLEUScore(n_gram=2),
      bleu3=torchmetrics.text.BLEUScore(n_gram=3),
      bleu4=torchmetrics.text.BLEUScore(n_gram=4),
      ter=torchmetrics.text.TranslationEditRate(normalize=True, lowercase=False),
      sacrebleu=torchmetrics.text.SacreBLEUScore(),
  )

  # Calling a metric object feeds it the batch; compute() reads the result.
  for metric in text_metrics.values():
    metric(preds, labels)

  results = {name: metric.compute() for name, metric in text_metrics.items()}
  results["terp"] = terp_metric(preds, labels)
  results["pinc"] = pinc_metric(preds, labels, n=4)
  results["formality"] = formality_metric(preds)
  return results

# Inference


In [None]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto")

config.json:   0%|          | 0.00/787 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

In [None]:
# Expose the tokenized columns of each split as torch tensors.
# The validation and test splits additionally carry one label column per
# extra reference (labels_1..labels_3).
train_dataset.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)
val_dataset.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels", 'labels_1', 'labels_2', 'labels_3'],
)
test_dataset.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels", 'labels_1', 'labels_2', 'labels_3'],
)

In [None]:
from tqdm import tqdm

In [None]:
def create_inference(dataset, batch_size=1, max_length=256):
  """Generate one decoded output string per dataset row.

  Parameters:
  - dataset: sliceable dataset whose slices provide "input_ids" and
    "attention_mask" tensors.
  - batch_size (int): rows generated per `model.generate` call. The default
    of 1 processes one row at a time.
  - max_length (int): `max_length` passed to `model.generate`.

  Returns:
  - list of str: decoded generations (special tokens removed), in dataset order.
  """
  # Follow the device the model was actually placed on instead of assuming
  # CUDA, so this also runs on CPU-only machines.
  device = model.device

  out = list()
  for start in tqdm(range(0, len(dataset), batch_size)):
    batch = dataset[start:start + batch_size]
    outputs = model.generate(
        batch['input_ids'].to(device),
        # Inputs are padded to a fixed length; pass the mask explicitly
        # rather than relying on it being inferred from the pad token.
        attention_mask=batch['attention_mask'].to(device),
        max_length=max_length,
    )
    out.extend(tokenizer.batch_decode(outputs, skip_special_tokens=True))

  return out

In [None]:
val_inference = create_inference(val_dataset)

100%|██████████| 5665/5665 [43:12<00:00,  2.19it/s]


In [None]:
import pickle
PIK = "./output/preprocessed/coedit_pretrained_val_inference.pkl"
with open(PIK, "wb") as f:
    pickle.dump(val_inference, f)

In [None]:
test_inference = create_inference(test_dataset)

100%|██████████| 2748/2748 [21:05<00:00,  2.17it/s]


In [None]:
import pickle

In [None]:
PIK = "./output/preprocessed/coedit_pretrained_test_inference.pkl"
with open(PIK, "wb") as f:
    pickle.dump(test_inference, f)

# Validation

In [None]:
import pickle

PIK = "./output/preprocessed/coedit_pretrained_val_inference.pkl"
with open(PIK, "rb") as file:
    val_inference = pickle.load(file)

In [None]:
PIK = "./output/preprocessed/coedit_pretrained_test_inference.pkl"
with open(PIK, "rb") as file:
    test_inference = pickle.load(file)

In [None]:
print(test_inference)

['What if it is a rebound relationship for both of you?', 'Good luck in your search for the one!', 'Why do so many people seem to want someone that they cannot have?', 'Do you like to argue and fight?', 'If you are determined, that is the best option.', 'My ex-husband was unfaithful for three years.', 'You should try to work it out, but do not let him keep you locked up at home.', 'Just ask him once in a while.', 'Men get bored and tire much before women do.', 'She is strong and loving, but I was not as happy as I had been.', 'That is one way of saying I do not like you.', 'King sang a song about this, "Only Your Mother Loves You."', 'He loves you as well, but time will tell.', 'He wants me to change my appearance.', 'I know I have someone to come home to.', 'Did you feel homosexual because I want to have sex?', 'Do you want all of your friends to be forever?', 'It does not mean that you are unattractive.', 'I would enjoy going to a club and dancing.', 'Do not go anywhere you went with

In [None]:
def create_reference(dataset):
  """Collect the four formal references of every row.

  Reads the 'transformation' column of `dataset` and returns a list with one
  4-tuple per row: (formal.ref0, formal.ref1, formal.ref2, formal.ref3).
  """
  data = dataset['transformation']
  rows = [data[i] for i in range(len(data))]

  return [
      (row['formal.ref0'], row['formal.ref1'], row['formal.ref2'], row['formal.ref3'])
      for row in rows
  ]

In [None]:
val_references =  create_reference(val_dataset)
test_references =  create_reference(test_dataset)

In [None]:
print(test_references[1])

('Good luck in your search for your perfect match.', 'Good luck in your search for a new love.', 'Good luck in your search for the one!', 'I wish you luck in your search for "the one".')


In [None]:
print(test_inference[1])

Good luck in your search for the one!


## Val set

In [None]:
val_metrics = compute_metrics(val_inference, val_references)

In [None]:
print(val_metrics)

{'bleu1': tensor(0.8309), 'bleu2': tensor(0.7445), 'bleu3': tensor(0.6697), 'bleu4': tensor(0.6008), 'ter': tensor(0.3316), 'sacrebleu': tensor(0.6014), 'terp': 0.0837746859062184, 'pinc': 0.5776668958033878, 'formality': -0.0495621363435475}


In [None]:
PIK = "./output/preprocessed/coedit_pretrained_val_metrics.pkl"
with open(PIK, "wb") as f:
    pickle.dump(val_metrics, f)

## Test set

In [None]:
test_metrics = compute_metrics(test_inference, test_references)

In [None]:
print(test_metrics)

{'bleu1': tensor(0.8212), 'bleu2': tensor(0.7379), 'bleu3': tensor(0.6656), 'bleu4': tensor(0.5993), 'ter': tensor(0.3289), 'sacrebleu': tensor(0.5997), 'terp': 0.09003509702685224, 'pinc': 0.5691812712923993, 'formality': -0.05235542468150741}


In [None]:
PIK = "./output/preprocessed/coedit_pretrained_test_metrics.pkl"
with open(PIK, "wb") as f:
    pickle.dump(test_metrics, f)