<a href="https://colab.research.google.com/github/mille-s/WebNLG-2020_Metrics/blob/main/WebNLG%2B_eval_scripts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title 1 - Download resources and clone repos
# RUN THIS CELL FIRST; when done, drag the predicted files into the "hypotheses" folder
# Prepares the workspace: clones the evaluation and reference-creation repos,
# fetches the BLEURT and METEOR resources into /content/GenerationEval/metrics,
# creates the /content/hypotheses upload folder, installs pinned Python
# packages and downloads the nltk 'punkt' data.
# NOTE(review): relative and absolute paths are mixed below, so the cell
# presumably expects /content to be the working directory - confirm when
# running outside Colab.

from IPython.display import clear_output

# Download WebNLG+ code
! git clone https://github.com/WebNLG/GenerationEval.git

# Download BLEURT
! git clone https://github.com/google-research/bleurt.git
# Fetch the bleurt-base-128 archive, unpack it and delete the zip
! wget https://storage.googleapis.com/bleurt-oss/bleurt-base-128.zip
! unzip bleurt-base-128.zip
! rm bleurt-base-128.zip
# Install the bleurt package from the local clone
! pip3 install --upgrade pip
! pip3 install /content/bleurt/. --user
# Move the clone under GenerationEval/metrics, then the unpacked bleurt-base-128 folder into it
! mv bleurt /content/GenerationEval/metrics
! mv bleurt-base-128 /content/GenerationEval/metrics/bleurt

# Download METEOR
! wget https://www.cs.cmu.edu/~alavie/METEOR/download/meteor-1.5.tar.gz
! tar -xvf meteor-1.5.tar.gz
! mv meteor-1.5 /content/GenerationEval/metrics
! rm meteor-1.5.tar.gz

# Download repos for creating references
! git clone https://gitlab.com/webnlg/corpus-reader.git
! git clone https://gitlab.com/shimorina/webnlg-dataset.git
import os
import os.path
# Folder the prediction files are dropped into (created only if missing)
hypotheses = '/content/hypotheses'
if not os.path.exists(hypotheses):
  os.makedirs(hypotheses)

# Human eval results and more data
#! git clone https://github.com/WebNLG/challenge-2020.git

# Install Python dependencies
! pip3 install nltk==3.5
! pip3 install pyter3==0.3
! pip3 install razdel==0.5.0
# ! pip3 install tabulate==0.8.7
! pip3 install tabulate==0.9
# ! pip3 install bert-score==0.3.5
! pip3 install bert-score==0.3.13
! pip install transformers==3.0.1

# Download nltk 'punkt'
import sys
import nltk
nltk.download('punkt')

# Wipe the (long) download/installation output of this cell
clear_output()

IMPORTANT: Now drag and drop the predicted files into the "hypotheses" folder.

> Click on the folder icon on the left if you don't see the folders.



In [None]:
#@title 2 - Adapt original codes to Colab
# RUN THIS CELL SECOND
# Adapt evaluation code to Colab; creates an eval_edited.py file which will be run in the final cell
# Adapt reference creation code to Colab; creates a benchmark_reader_edited.py and a generate_references_edited.py file; the latter is run in the next cell

import codecs
import re

def edit_file(code_lines, landmark, fo, new_code):
  """Copy code_lines to fo with a few fixed patches, replacing the tail by new_code.

  Lines are written to ``fo`` in order. Three known lines are rewritten on the
  way (see the inline comments). As soon as a line matches ``landmark``,
  ``new_code`` is written instead of it and copying stops, so the landmark
  line and every line after it are dropped from the output.

  Args:
    code_lines: iterable of source lines, each including its line break.
    landmark: regular expression (applied with re.search) identifying the
      first line to be replaced by ``new_code``.
    fo: object with a ``write(str)`` method receiving the edited code; it is
      not closed here.
    new_code: text written in place of the landmark line and the rest of the
      input.
  """
  for line in code_lines:
    # Point the imports at the edited copy of the reader module
    if re.search(r'^from benchmark_reader import Benchmark', line):
      fo.write('from benchmark_reader_edited import Benchmark\n')
    elif re.search(r'^from benchmark_reader import select_files', line):
      fo.write('from benchmark_reader_edited import select_files\n')
    # I don't know why it was set like this; by changing it we get all lexicalisations in ru instead of just 3
    elif re.search(r'^            for lex in entry\.lexs\[1::2\]:  # take every second lexicalisation, i\.e\. only ru\n', line):
      fo.write('            for lex in entry.lexs:\n')
    elif re.search(landmark, line):
      # Everything from the landmark onwards is replaced by the new code
      fo.write(new_code)
      break
    else:
      fo.write(line)

# evaluation code
# Reads GenerationEval/eval.py and writes a patched copy, eval_edited.py, in
# which selected lines are replaced so that the script runs on Colab.
path_eval = '/content/GenerationEval/eval.py'
with codecs.open(path_eval, 'r', 'utf-8') as eval_file:
  eval_code_lines = eval_file.readlines()

# (pattern, replacement) pairs: the first pattern found in a line (re.search)
# causes the whole line to be replaced; unmatched lines are copied verbatim.
eval_line_patches = [
  # comment sys.argv bc used later in a different way
  (r'^sys\.argv = ', '#sys.argv = sys.argv[:1]\n'),
  # adapt paths to Colab repo
  (r'^BLEU_PATH = ', "BLEU_PATH = '/content/GenerationEval/metrics/multi-bleu-detok.perl'\n"),
  (r'^METEOR_PATH = ', "METEOR_PATH = '/content/GenerationEval/metrics/meteor-1.5/meteor-1.5.jar'\n"),
  (r'^def bleurt', 'def bleurt(references, hypothesis, num_refs, checkpoint = "/content/GenerationEval/metrics/bleurt/bleurt-base-128"):\n'),
  # bleurt throws an error if arguments are not flagged (no positional arguments accepted)
  (r'^    scores = scorer\.score\(refs, cands\)', '    scores = scorer.score(references=refs, candidates=cands)\n'),
  # argParser needs arguments passed explicitly to run on Colab
  (r'^    args = argParser.parse_args\(\)', '    args = argParser.parse_args(sys.argv[1:])\n'),
]
# add one decimal to the scores to replicate exactly the WebNLG+ results
for metric in ('bleu_nltk', 'meteor', 'chrf++', 'ter', 'bert_precision', 'bert_recall', 'bert_f1', 'bleurt'):
  eval_line_patches.append((
    r"^        values.append\(round\(result\['" + re.escape(metric) + r"'\], 2\)\)",
    "        values.append(round(result['" + metric + "'], 3))\n",
  ))

with codecs.open('/content/GenerationEval/eval_edited.py', 'w', 'utf-8') as fo:
  for line in eval_code_lines:
    for pattern, replacement in eval_line_patches:
      if re.search(pattern, line):
        fo.write(replacement)
        break
    else:
      # no patch applies to this line
      fo.write(line)

# reference text creation code
# Writes patched copies of the corpus-reader scripts (benchmark_reader_edited.py
# and generate_references_edited.py) next to the originals, using edit_file.
path_reader = '/content/corpus-reader/benchmark_reader.py'
path_ref_generator = '/content/corpus-reader/generate_references.py'
with codecs.open(path_reader, 'r', 'utf-8') as reader_file:
  reader_code_lines = reader_file.readlines()
with codecs.open(path_ref_generator, 'r', 'utf-8') as ref_generator_file:
  ref_generator_code_lines = ref_generator_file.readlines()

# I needed to modify the function that gathers files to process, as I didn't manage to make it work for test data otherwise
new_code_reader = "def select_files(topdir, category='', size=(1, 8)):\n    finalfiles = []\n    if topdir.endswith('dev') or topdir.endswith('train'):\n        finaldirs = [topdir+'/'+str(item)+'triples' for item in range(size[0], size[1])]\n        for item in finaldirs:\n            finalfiles += [(item, filename) for filename in sorted(listdir(item)) if category in filename and '.xml' in filename]\n    else:\n        finalfiles += [(topdir, filename) for filename in sorted(listdir(topdir)) if 'generation-test-data-with-refs' in filename and '.xml' in filename]\n    return finalfiles"
# On this file I only changed the paths
new_code_ref_generator = "path = '/content/webnlg-dataset/release_v3.0/en/test'\nrun_on_corpus_per_lang(path, 'en')\n# Russian\npath = '/content/webnlg-dataset/release_v3.0/ru/test'\nrun_on_corpus_per_lang(path, 'ru')"

# Everything from the landmark line to the end of each file is replaced by the new code above
with codecs.open('/content/corpus-reader/benchmark_reader_edited.py', 'w', 'utf-8') as fo2:
  edit_file(reader_code_lines, r'^def select_files', fo2, new_code_reader)
with codecs.open('/content/corpus-reader/generate_references_edited.py', 'w', 'utf-8') as fo3:
  edit_file(ref_generator_code_lines, r"^path = '\./challenge2020_train_dev_v2", fo3, new_code_ref_generator)


In [None]:
#@title 3 -  Create reference files
# RUN THIS CELL THIRD
# Create reference files for automatic evaluation

import shutil

# Destination folders for reference files: the English and Russian test
# references, and a small English set used to try out the metrics quickly.
data_to_process_en = '/content/GenerationEval/data_to_process/en/references'
data_to_process_ru = '/content/GenerationEval/data_to_process/ru/references'
data_to_process_small = '/content/GenerationEval/data_small/en/references'

def postproc_and_move(ref_file_path, dest_folder):
  """Write a copy of a reference file into dest_folder, minus its final line break.

  The copy is named after the part of the file name that precedes the first
  '-' (e.g. 'reference0-en.txt' -> 'reference0'). The source file itself is
  left in place. An empty source file yields an empty copy.

  Args:
    ref_file_path: path of the UTF-8 reference file to read.
    dest_folder: existing folder in which the copy is written.
  """
  # Only the file name takes part in the naming, so hyphens in folder names are harmless
  short_filename = os.path.basename(ref_file_path).split('-')[0]
  out_path = os.path.join(dest_folder, short_filename)
  # Read first, so that a missing source does not leave an empty output behind
  with codecs.open(ref_file_path, 'r', 'utf-8') as fi:
    file_lines = fi.readlines()
  if file_lines:
    # remove last linebreak of the file (creates problems with some metrics)
    file_lines[-1] = file_lines[-1].split('\n')[0]
  with codecs.open(out_path, 'w', 'utf-8') as fo:
    fo.writelines(file_lines)

if not os.path.exists(data_to_process_en):
  os.makedirs(data_to_process_en)
if not os.path.exists(data_to_process_ru):
  os.makedirs(data_to_process_ru)
if not os.path.exists(data_to_process_small):
  os.makedirs(data_to_process_small)

# Create references for the test data
path_create_refs = '/content/corpus-reader/generate_references_edited.py'
! python {path_create_refs}

# Move references to the folder they'll be used from
postproc_and_move('reference0-en.txt', data_to_process_en)
postproc_and_move('reference1-en.txt', data_to_process_en)
postproc_and_move('reference2-en.txt', data_to_process_en)
postproc_and_move('reference3-en.txt', data_to_process_en)
postproc_and_move('reference4-en.txt', data_to_process_en)
postproc_and_move('reference0-ru.txt', data_to_process_ru)
postproc_and_move('reference1-ru.txt', data_to_process_ru)
postproc_and_move('reference2-ru.txt', data_to_process_ru)
postproc_and_move('reference3-ru.txt', data_to_process_ru)
postproc_and_move('reference4-ru.txt', data_to_process_ru)
postproc_and_move('reference5-ru.txt', data_to_process_ru)
postproc_and_move('reference6-ru.txt', data_to_process_ru)

# Cleanup
! rm 'reference0-en.txt'
! rm 'reference1-en.txt'
! rm 'reference2-en.txt'
! rm 'reference3-en.txt'
! rm 'reference4-en.txt'
! rm 'reference0-ru.txt'
! rm 'reference1-ru.txt'
! rm 'reference2-ru.txt'
! rm 'reference3-ru.txt'
! rm 'reference4-ru.txt'
! rm 'reference5-ru.txt'
! rm 'reference6-ru.txt'

# Create references with 10 sentences to test the metrics quickly
# Each file below is copied from data/en to data_small/en, keeping only its
# first 10 lines and dropping the line break at the end of the last kept line.
small_src_dir = '/content/GenerationEval/data/en'
small_dest_dir = '/content/GenerationEval/data_small/en'
small_rel_paths = [
  'references/reference0',
  'references/reference1',
  'references/reference2',
  'references/reference3',
  'hypothesis',
]
for rel_path in small_rel_paths:
  with codecs.open(os.path.join(small_src_dir, rel_path), 'r', 'utf-8') as small_in:
    first_lines = small_in.readlines()[:10]
  if first_lines:
    # no line break at the very end of the file
    first_lines[-1] = first_lines[-1].split('\n')[0]
  with codecs.open(os.path.join(small_dest_dir, rel_path), 'w', 'utf-8') as small_out:
    small_out.writelines(first_lines)

In [None]:
#@title 4 - Run evaluation (collect results in log_eval folder)
import glob
import os
lang = "en"#@param ["en", "ru"]
#metrics = 'bleu,meteor,ter,chrf++,bert,bleurt'#@param
metrics = 'bleu,meteor,chrf++,bert'#@param
# small_test = 'yes' if want to test quickly the code with files that have 10 examples.
# Expected output (en):
#   BLEU    BLEU NLTK    METEOR    chrF++    TER    BERT-SCORE P    BERT-SCORE R    BERT-SCORE F1    BLEURT
# ------  -----------  --------  --------  -----  --------------  --------------  ---------------  --------
#  73.02        0.726     0.578     0.877  0.194           0.978           0.971            0.974      0.81
small_test = 'no'#@param ["yes", "no"]
log_folder = os.path.join('/content', 'log_eval')
if not os.path.exists(log_folder):
  os.makedirs(log_folder)

num_refs = ''
if lang == 'en':
  num_refs = '4'
elif lang == 'ru':
  num_refs = '6'

# metrics = 'bleu,meteor,ter,chrf++,bert,bleurt'
# Approximate times below on English dataset (1779 texts) with no GPU
# Very fast: BLEU, CHRF++ (~15 sec)
# Fast: METEOR (~40 sec)
# Slow: TER (~10 min, can't use GPU)
# Very slow (Need GPU): BertScore, Bleurt (~1h+ or fail without GPU)
path_code_eval = '/content/GenerationEval/eval_edited.py'
path_hyp_uploaded = '/content/hypotheses/'
if small_test == 'yes':
  path_ref_eval = '/content/GenerationEval/data_small/en/references/reference'
  path_hyp_eval = '/content/GenerationEval/data_small/en/hypothesis'
  log_small = os.path.join(log_folder, 'log_eval_small.txt')
  ! python {path_code_eval} -R {path_ref_eval} -H {path_hyp_eval} -m {metrics} | tee {log_small}
else:
  path_ref_eval = '/content/GenerationEval/data_to_process/'+lang+'/references/reference'
  path_hyp_eval = '/content/GenerationEval/data_to_process/'+lang+'/hypothesis'
  for Filepath in glob.glob(os.path.join(path_hyp_uploaded, '*.txt')):
    print(f'\nProcessing {Filepath}...')
    log_filename = os.path.join(log_folder, 'log_eval_'+Filepath.rsplit('/',1)[1].split('.')[0]+'.txt')
    file_pred_lines = codecs.open(Filepath, 'r', 'utf-8').readlines()
    fo = codecs.open(path_hyp_eval, 'w', 'utf-8')
    for line in file_pred_lines:
      fo.write(line)
    fo.close()
    ! python {path_code_eval} -R {path_ref_eval} -H {path_hyp_eval} -lng {lang} -nr {num_refs} -m {metrics} | tee {log_filename}

# argParser.add_argument("-R", "--reference", help="reference translation", required=True)
# argParser.add_argument("-H", "--hypothesis", help="hypothesis translation", required=True)
# argParser.add_argument("-lng", "--language", help="evaluated language", default='en')
# argParser.add_argument("-nr", "--num_refs", help="number of references", type=int, default=4)
# argParser.add_argument("-m", "--metrics", help="evaluation metrics to be computed", default='bleu,meteor,ter,chrf++,bert,bleurt')
# argParser.add_argument("-nc", "--ncorder", help="chrF metric: character n-gram order (default=6)", type=int, default=6)
# argParser.add_argument("-nw", "--nworder", help="chrF metric: word n-gram order (default=2)", type=int, default=2)
# argParser.add_argument("-b", "--beta", help="chrF metric: beta parameter (default=2)", type=float, default=2.0)


In [None]:
#@title 5 - Zip and download log files
# Zips the log folder and triggers a browser download of the archive when the
# form option below is 'yes'. Uses log_folder, which is set in the evaluation
# cell, so that cell has to be run first.
download_log_files = 'yes'#@param['yes', 'no']

from IPython.display import clear_output
import locale
# Make locale.getpreferredencoding() always report UTF-8.
# NOTE(review): presumably a workaround needed for the "!" shell command below
# to run on Colab - confirm it is still required.
locale.getpreferredencoding = lambda: "UTF-8"

if download_log_files == 'yes':
  from google.colab import files
  zip_name_log = '/content/log_eval.zip'
  # Recursively add the log folder to the archive
  !zip -r {zip_name_log} {log_folder}

  # Hide the zip listing before starting the download
  clear_output()

  files.download(zip_name_log)