# OpenNMT-py Middle & Modern

Sequence-to-Sequence Encoder-Decoder Models for translating Middle and Modern English


In [89]:
from google.colab import drive

# Mount point for Google Drive inside the Colab VM
ROOT = "/content/gdrive"

drive.mount(ROOT)

# Sanity check: list the team's shared drive to confirm it is accessible
!ls "{ROOT}/Shareddrives/CS 175 Project"

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
'AElfric to Albert User Evaluation Study.gform'
 Datasets
 Diagrams
 models
 Notebooks
'Papers Other Resources'
'Progress Reports'
 Proposal
'Team AElfrictoAlbert.gsheet'
 token.txt
'User Evaluation Study Key.gdoc'
'Yanqi making it repo public.gdoc'


In [90]:
# Clone github repository setup
# `join` is used to build paths underneath the mounted drive (ROOT)
from os.path import join

# path to your project on Google Drive
MY_GOOGLE_DRIVE_PATH = 'My Drive/cs175-Aelfric-to-Albert'
GIT_USERNAME = "mayaschwarz"

# The access token is read from a file on the shared drive.
# Never paste it into the notebook and never print it: cell outputs are saved
# together with the notebook and end up in the repository.
GIT_TOKEN_PATH = join(ROOT, "Shareddrives/CS 175 Project/token.txt")

with open(GIT_TOKEN_PATH, 'r') as f:
    GIT_TOKEN = f.readline().strip()

if not GIT_TOKEN:
    raise ValueError("GIT_TOKEN MISSING")

GIT_REPOSITORY = "cs175--lfric-to-Albert"

PROJECT_PATH = join(ROOT, MY_GOOGLE_DRIVE_PATH)

# It's good to print out the value if you are not sure
print("PROJECT_PATH: ", PROJECT_PATH)

# Authenticated clone URL. The `f` prefix is required: without it the braces are
# sent to GitHub literally, which is what produced the earlier "400 Bad Request".
GIT_PATH = f"https://{GIT_TOKEN}@github.com/{GIT_USERNAME}/{GIT_REPOSITORY}.git"

# Show the URL with the token masked so the secret never lands in saved output
print("GIT_PATH: ", GIT_PATH.replace(GIT_TOKEN, "<GIT_TOKEN>"))

PROJECT_PATH:  /content/gdrive/My Drive/cs175-Aelfric-to-Albert
GIT_PATH:  https://<GIT_TOKEN>@github.com/mayaschwarz/cs175--lfric-to-Albert.git


In [None]:
# Ask for confirmation before replacing the local checkout with a fresh clone.
# The prompt repeats until the answer starts with 'y' or 'n'; an empty answer counts as "no".
while True:
    response = input("Are you sure you want to download the repo? Doing so will delete all unpush work. [y|N] ").lower().strip()
    if not response or response[0] == 'n':
        # keep the existing checkout untouched
        break
    elif response[0] == "y":
        # destructive: removes the whole project folder before cloning again
        !rm -rv "{PROJECT_PATH}"
        !mkdir -p "{PROJECT_PATH}" 
        !git clone "{GIT_PATH}" "{PROJECT_PATH}"
        break

# cd into the repository (runs whether or not a fresh clone was made)
%cd "{PROJECT_PATH}"

Are you sure you want to download the repo? Doing so will delete all unpush work. [y|N] N
/content/gdrive/My Drive/cs175-Aelfric-to-Albert


In [91]:
# Bring the checkout up to date with its remote.
# NOTE: `git pull` without arguments needs the current branch to track a remote branch.
!git pull 
# Show the local branches; the current one is marked with `*`
!git branch

There is no tracking information for the current branch.
Please specify which branch you want to merge with.
See git-pull(1) for details.

    git pull <remote> <branch>

If you wish to set tracking information for this branch you can do so with:

    git branch --set-upstream-to=origin/<branch> middle-and-modern-lstm

  main[m
* [32mmiddle-and-modern-lstm[m
  old-to-modern-lstm[m


## Setting up the Python Environment


In [None]:
# On Google Colab ONLY
# Reinstall Torch to avoid incompatibility with Cuda 10.1

# NOTE: By the end of the installation, it might ask for restarting the runtime...
# In this case, just click the "RESTART RUNTIME" button.
!pip install --ignore-installed torch==1.6.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html

In [None]:
# Install the remaining packages if needed.
# NOTE(review): cltk, contractions, OpenNMT-py, sacrebleu and tensorboard are not pinned,
# so a fresh runtime may resolve different versions than the ones used for the saved outputs.
!pip install nltk==3.5 pyyaml==5.3.1 torchvision==0.7.0 cltk contractions OpenNMT-py sacrebleu tensorboard

In [None]:
# Load the TensorBoard notebook extension (provides the %tensorboard magic)
%load_ext tensorboard

In [None]:
# standard library
import math
from os import listdir
import re
import random

# additional libraries (pip install ..)
import cltk
import nltk
import onmt
from onmt.utils.misc import set_random_seed
import pyonmttok
import torch
import torch.nn as nn
from torchtext.data import Dataset
import yaml

# local libraries
from src.data_manager import *
from src.paths import *

In [None]:
# Download the WordNet corpus, needed for METEOR evaluation
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
def set_deterministic(seed: int = 1234) -> None:
    """
    Seed every random number generator used in this notebook and switch cuDNN
    to deterministic behaviour, so repeated runs are as reproducible as possible.

    Arguments:
      seed {int} -- value used for Python's `random`, PyTorch (CPU and every GPU) and OpenNMT-py
    """
    random.seed(seed)
    torch.manual_seed(seed)
    # seed every visible GPU, not only the current one (silently ignored without CUDA)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # the cuDNN autotuner may pick different kernels from run to run; turn it off
    torch.backends.cudnn.benchmark = False
    # OpenNMT-py's own seeding helper
    set_random_seed(seed, torch.cuda.is_available())

set_deterministic()

## Preprocessing and Tokenization


In [None]:
from cltk.corpus.middle_english.alphabet import normalize_middle_english
from cltk.phonology.old_english.phonology import Word
from typing import Union

def _normalize(text: str, language_code: str):
    if language_code == 'ang':
        # old english
        DONT_NORMALIZE = '!?.&,:;"'
        normalized_words = list()
        for word in text.split():
            if len(word) == 0:
                continue

            if word[-1] in DONT_NORMALIZE:
                normalized_words.append(Word(word[:-1]).ascii_encoding() + word[-1])
            else:
                normalized_words.append(Word(word).ascii_encoding())

        return ' '.join(normalized_words)
    elif language_code == 'enm':
        # middle english
        return normalize_middle_english(text, to_lower=False, alpha_conv=True)
    return text

def tokenizer(text: str, language_code: str, **kwargs: bool) -> [str]:
    """
    Normalize a sentence for its language and split it into tokens.

    Arguments:
      text {str} -- raw sentence
      language_code {str} -- language code forwarded to _normalize
      **kwargs {bool} -- extra options forwarded to pyonmttok.Tokenizer (e.g. case_markup)

    Returns:
      the list of tokens produced by the "aggressive" pyonmttok tokenizer with joiner annotation
    """
    onmt_tokenizer = pyonmttok.Tokenizer("aggressive", joiner_annotate=True, **kwargs)
    normalized_text = _normalize(text, language_code)
    # tokenize() returns a pair; only the token list is needed here
    pieces, _features = onmt_tokenizer.tokenize(normalized_text)
    return pieces

def write_tokenized_dataset(dataset: {str: {str: [str]}}, source: str, source_language_code: str, target: str, target_language_code: str, file_paths: {str: (Union[str, Path], Union[str, Path])}, token_kwargs: {str: bool} = None) -> None:
    """
    Given a dataset, tokenizes and writes the contents according to its file path

    Every split listed in `file_paths` is written as two parallel files: one
    tokenized sentence per line, tokens separated by single spaces, no trailing newline.

    Arguments:
      dataset {{str: {str: [str]}}} -- dataset returned from create_datasets, indexed as dataset[split][version]
      source {str} -- key of the source version inside each split
      source_language_code {str} -- language code used to normalize the source sentences
      target {str} -- key of the target version inside each split
      target_language_code {str} -- language code used to normalize the target sentences
      file_paths -- dictionary with key as the dataset-type (training, validation, test), item as (path to source, path to target)
      token_kwargs {{str: bool}} -- kwargs for the tokenizer (case_markup, etc.); None means no extra options
    """
    # a fresh dict per call instead of a shared mutable default argument
    if token_kwargs is None:
        token_kwargs = {}

    for split_name, (src_path, tgt_path) in file_paths.items():
        # look the split up before opening anything, so a wrong key cannot truncate existing files
        split = dataset[split_name]
        src_lines = (" ".join(tokenizer(line, source_language_code, **token_kwargs)) for line in split[source])
        tgt_lines = (" ".join(tokenizer(line, target_language_code, **token_kwargs)) for line in split[target])

        # plain write mode is enough: the files are never read back through these handles
        with open(src_path, mode='w', encoding='utf-8') as src, open(tgt_path, mode='w', encoding='utf-8') as tgt:
            src.write('\n'.join(src_lines))
            tgt.write('\n'.join(tgt_lines))

# Training

In [None]:
# Check if a GPU is attached to this runtime (lists one line per device)
# If not, go to "Runtime" menu > "Change runtime type" > "GPU"

!nvidia-smi -L

GPU 0: Tesla P100-PCIE-16GB (UUID: GPU-f52d38a7-d665-47f6-163b-621df6d6278e)


In [None]:
# Make sure the GPU is visible to PyTorch
import torch

cuda_available = torch.cuda.is_available()
print(cuda_available)

# Query the device only when CUDA is present: torch.cuda.current_device()
# raises on a CPU-only runtime, which used to hide the "False" diagnostic.
gpu_id = torch.cuda.current_device() if cuda_available else None
if cuda_available:
    print(torch.cuda.get_device_name(gpu_id))
else:
    print('No GPU visible to PyTorch: go to "Runtime" menu > "Change runtime type" > "GPU"')

True
Tesla P100-PCIE-16GB


In [None]:
def build_and_train(config_path):
    """
    Build the vocabulary and then train a model from one OpenNMT-py YAML config.

    Arguments:
      config_path -- path to the YAML config file; passed as `-config` to both commands
    """
    # build and store vocab in run folder (`-n_sample -1` builds it on the full datasets)
    !onmt_build_vocab -config "{config_path}" -n_sample -1
    # begin training
    !onmt_train -config "{config_path}"

# Translation and Evaluation

See [here](https://opennmt.net/OpenNMT-py/options/translate.html) for more info on translation parameters

Evaluation using BLEU and METEOR

In [None]:
from sacrebleu import corpus_bleu
from nltk.translate.meteor_score import meteor_score

def calculate_meteor_score(reference: [str], hypothesis: [str]) -> float:
    """
    Average sentence-level METEOR score over a corpus.

    Arguments:
      reference {[str]} -- one reference sentence per hypothesis, in the same order
      hypothesis {[str]} -- the predicted sentences

    Returns:
      the sum of the pairwise scores divided by the number of hypotheses
      (0.0 when there are no hypotheses). Pairs are formed with zip, so extra
      items in the longer list are not scored.
    """
    if not hypothesis:
        # nothing to score; avoids a ZeroDivisionError
        return 0.0

    # nltk 3.5's meteor_score expects a *list* of reference strings; a bare
    # string would be iterated character by character
    total = sum((meteor_score([ref], hyp) for ref, hyp in zip(reference, hypothesis)), 0.0)
    return total / len(hypothesis)

def calculate_bleu_score(reference: [[str]], hypothesis: [str]) -> float:
    """
    Corpus-level BLEU computed with sacrebleu.

    Arguments:
      reference {[[str]]} -- list of reference streams, each a list of sentences
      hypothesis {[str]} -- the predicted sentences

    Returns:
      the `score` attribute of the result returned by sacrebleu's corpus_bleu
    """
    return corpus_bleu(hypothesis, reference).score

In [None]:
def evaluate(model_paths: [str], source_path: Union[str, Path], target_path: Union[str, Path], max_length: int, beam_size: 5, token_kwargs: {str:bool}, save_folder='./predictions') -> ([([str], float, float)], [str]):
    tok = pyonmttok.Tokenizer("aggressive", **token_kwargs)
    
    # retrieve and detokenize the reference
    # (this ensures that any normalization techniques used do not effect the scoring)
    ref = []
    with open(f"{target_path}", encoding='utf-8') as f:
        ref = [tok.detokenize(line.rstrip('\n').split(' ')) for line in f]
  
    scores = []
    
    for m in model_paths:
        m_name = m.name[:-3] if isinstance(m, Path) else m.rsplit('(\\|\/)')[-1][:-3]
        file_path = f"{save_folder}/{m_name}_pred.txt"
        # Call the translate script to generate token predictions
        !onmt_translate -model "{m}" -src "{source_path}" -output "{file_path}" -gpu 0 -beam_size "{beam_size}" -max_length "{max_length}"
        
        hyp = []
        with open(file_path, encoding='utf8') as f:
            hyp = [tok.detokenize(line.rstrip('\n').split(' ')) for line in f]
            if hyp[-1] == '':
              hyp = hyp[:-1]
        # get the bleu score
        bleu = calculate_bleu_score([ref], hyp)
        # get the meteor score
        # meteor = calculate_meteor_score(ref, hyp)
        scores.append((hyp, bleu, 0.0))
        print(f'{m} \t BLEU={bleu:.4f}')

    return scores, ref

# Configuring the Data, Model, and Training Parameters
Generate a YAML file that contains all the hyperparameters and system variables needed to build the vocabulary and to build and train the model.

See [here](https://opennmt.net/OpenNMT-py/options/build_vocab.html) for more info on building vocab

See [here](https://opennmt.net/OpenNMT-py/options/train.html) for more info about building the model and training parameters

In [None]:
# declare the config folder to store all the yaml files
# Explicit import: this cell runs before the notebook's other `from pathlib import Path`,
# so it must not depend on a star import happening to export the name.
from pathlib import Path

CONFIG_NAME = 'openmt-config'
CONFIG_PATH = Path(CONFIG_NAME)
# same effect as `mkdir -p`, without leaving Python
CONFIG_PATH.mkdir(parents=True, exist_ok=True)

## Middle and Modern English

### Middle to Modern


In [None]:
from pathlib import Path

# Folder holding everything produced for the Middle -> Modern English model
ENM2MOD_TRANSLATE_NAME = 'enm2mod'
!mkdir -p '{ENM2MOD_TRANSLATE_NAME}'

# PATH VARIABLES
ENM2MOD_TRANSLATE_PATH = Path(ENM2MOD_TRANSLATE_NAME)
ENM2MOD_RUN_PATH = ENM2MOD_TRANSLATE_PATH / 'run'
!mkdir -p "{ENM2MOD_RUN_PATH}"

# Dataset Variables
# Bible version file names (looked up with get_bible_versions_by_file_name) and the
# language codes that select the normalization applied before tokenization
ENM2MOD_SOURCE_VER = 't_wyc'
ENM2MOD_SRC_LANG_CODE = 'enm'
ENM2MOD_TARGET_VER = 't_kjv'
ENM2MOD_TGT_LANG_CODE = 'eng'

# Used both as the word-count filter during preprocessing and as -max_length when translating
MAX_SENTENCE_LENGTH = 60

# Dataset Paths
DATA_PATH = Path('data/preprocessed')
!mkdir -p "{DATA_PATH}"

In [None]:
# Generate splits and write to files
versions = get_bible_versions_by_file_name([ENM2MOD_SOURCE_VER, ENM2MOD_TARGET_VER])

datasets = create_datasets(versions, .82, 
                preprocess_operations = [preprocess_filter_num_words(MAX_SENTENCE_LENGTH),
                                         preprocess_expand_contractions(),
                                         preprocess_filter_num_sentences(),
                ]);

Finding shared verses between 2 versions...        done in 0.303 seconds
Run preprocess operations...                       done in 0.870 seconds
Separate test verses...                            done in 0.012 seconds
Separate validation verses...                      done in 0.020 seconds
Zip together verses (shuffle = True)...            done in 0.031 seconds

# verses before preprocessing:  28,514
# verses after  preprocessing:  22,398 (79%)


# training verses:    15,513 (69%)
# validation verses:   3,406 (15%)
# test verses:         3,479 (16%)


In [None]:
# File extensions are the version names without their two-character prefix ('t_wyc' -> 'wyc')
ENM2MOD_SRC_EXT = ENM2MOD_SOURCE_VER[2:]
ENM2MOD_TGT_EXT = ENM2MOD_TARGET_VER[2:]

# dataset-type -> (path to source file, path to target file)
enm2mod_file_paths = {
    split: (DATA_PATH / f'{prefix}.{ENM2MOD_SRC_EXT}', DATA_PATH / f'{prefix}.{ENM2MOD_TGT_EXT}')
    for split, prefix in [
        ('training', 'bible-train'),
        ('validation', 'bible-valid'),
        ('test', 'bible-test'),
    ]
}

# Tokenizer options shared by dataset writing and evaluation
token_kwargs = dict(case_markup=True)

In [None]:
# Tokenize every split and write the source/target files listed in enm2mod_file_paths
write_tokenized_dataset(datasets, ENM2MOD_SOURCE_VER, ENM2MOD_SRC_LANG_CODE, ENM2MOD_TARGET_VER, ENM2MOD_TGT_LANG_CODE, enm2mod_file_paths, token_kwargs)

In [None]:
# Vocabulary files, referenced as src_vocab / tgt_vocab in the YAML config
ENM2MOD_SRC_VOCAB_PATH = ENM2MOD_RUN_PATH / 'vocab.src'
ENM2MOD_TGT_VOCAB_PATH = ENM2MOD_RUN_PATH / 'vocab.tgt'

# File name of the config, created inside CONFIG_PATH
enm2mod_yaml = 'enm2mod.yaml'

# Checkpoints are saved in this folder under names starting with the prefix
ENM2MOD_MODEL_PATH = ENM2MOD_RUN_PATH / 'models'
ENM2MOD_MODEL_PREFIX = 'enm2mod'

In [None]:
# OpenNMT-py configuration for the Middle -> Modern English model, rendered from the
# path variables defined in the previous cells.
# NOTE(review): the training corpus is labelled `homilies` although it points at the
# bible training files; the label shows up in the build-vocab log -- confirm whether to rename it.
config =  f'''# {enm2mod_yaml}
save_data: {ENM2MOD_RUN_PATH}

### DATA PROPROCESSING ###
## Where the vocab(s) will be written
src_vocab: {ENM2MOD_SRC_VOCAB_PATH}
tgt_vocab: {ENM2MOD_TGT_VOCAB_PATH}

# Corpus opts:
data:
    homilies:
        path_src: {enm2mod_file_paths['training'][0]}
        path_tgt: {enm2mod_file_paths['training'][1]}
        transforms: []
        weight: 1
    valid:
        path_src: {enm2mod_file_paths['validation'][0]}
        path_tgt: {enm2mod_file_paths['validation'][1]}
        transforms: []

## silently ignore empty lines in data
skip_empty_level: silent

### TRAINING ###
## Where the model will be saved
save_model: {ENM2MOD_MODEL_PATH / ENM2MOD_MODEL_PREFIX}
save_checkpoint_steps: 1000
average_decay: 0.0005
seed: 1234
report_every: 100
train_steps: 100000
valid_steps: 100
early_stopping: 10
early_stopping_criteria: accuracy
tensorboard: True
tensorboard_log_dir: {ENM2MOD_RUN_PATH / 'logs'}

# Batching
world_size: 1
gpu_ranks: [0]
batch_size: 64
valid_batch_size: 64
batch_size_multiple: 1

# Optimization
model_dtype: "fp32"
optim: "adam"
learning_rate: 0.001

# Model
encoder_type: rnn
decoder_type: rnn
rnn_type: LSTM
enc_layers: 2
dec_layers: 2
rnn_size: 512
word_vec_size: 256
'''

# Write the rendered config where build_and_train will read it
with open(CONFIG_PATH / enm2mod_yaml, "w+") as config_yaml:
  config_yaml.write(config)

In [None]:
# Build the vocabulary and train the Middle -> Modern English model
build_and_train(CONFIG_PATH / enm2mod_yaml)

[2021-03-04 03:19:46,729 INFO] Counter vocab from -1 samples.
[2021-03-04 03:19:46,729 INFO] n_sample=-1: Build vocab on full datasets.
[2021-03-04 03:19:46,736 INFO] homilies's transforms: TransformPipe()
[2021-03-04 03:19:46,738 INFO] Loading ParallelCorpus(data/preprocessed/bible-train.wyc, data/preprocessed/bible-train.kjv, align=None)...
[2021-03-04 03:19:47,138 INFO] Counters src:13530
[2021-03-04 03:19:47,139 INFO] Counters tgt:9853
[2021-03-04 03:19:48,172 INFO] Parsed 2 corpora from -data.
[2021-03-04 03:19:48,173 INFO] Get special vocabs from Transforms: {'src': set(), 'tgt': set()}.
[2021-03-04 03:19:48,173 INFO] Loading vocab from text file...
[2021-03-04 03:19:48,173 INFO] Loading src vocabulary from enm2mod/run/vocab.src
[2021-03-04 03:19:48,196 INFO] Loaded src vocab has 13530 tokens.
[2021-03-04 03:19:48,201 INFO] Loading tgt vocabulary from enm2mod/run/vocab.tgt
[2021-03-04 03:19:48,238 INFO] Loaded tgt vocab has 9853 tokens.
[2021-03-04 03:19:48,242 INFO] Building fie

In [None]:
# retrieve the models: every file in the model folder whose name starts with the prefix
# NOTE(review): listdir gives no ordering guarantee, so checkpoints may be evaluated in any order
enm2mod_model_paths = [ ENM2MOD_MODEL_PATH / f for f in listdir(ENM2MOD_MODEL_PATH) if f.startswith(ENM2MOD_MODEL_PREFIX)]

# Folder receiving one prediction file per checkpoint
ENM2MOD_PREDICTIONS_PATH = ENM2MOD_RUN_PATH / 'predictions'
!mkdir -p "{ENM2MOD_PREDICTIONS_PATH}"

# Translate the test split with every checkpoint (beam size 10) and report BLEU
enm2mod_scores, _ = evaluate(enm2mod_model_paths, enm2mod_file_paths['test'][0], enm2mod_file_paths['test'][1], MAX_SENTENCE_LENGTH, 10, token_kwargs, ENM2MOD_PREDICTIONS_PATH)

[2021-03-04 03:30:27,494 INFO] Translating shard 0.
[2021-03-04 03:30:54,879 INFO] PRED AVG SCORE: -0.8870, PRED PPL: 2.4279
enm2mod/run/models/enm2mod_step_1000.pt 	 BLEU=17.2577
[2021-03-04 03:30:59,571 INFO] Translating shard 0.
[2021-03-04 03:31:26,516 INFO] PRED AVG SCORE: -0.6079, PRED PPL: 1.8365
enm2mod/run/models/enm2mod_step_2000.pt 	 BLEU=22.7357
[2021-03-04 03:31:31,199 INFO] Translating shard 0.
enm2mod/run/models/enm2mod_step_3000.pt 	 BLEU=23.4319
[2021-03-04 03:32:03,042 INFO] Translating shard 0.
[2021-03-04 03:32:30,027 INFO] PRED AVG SCORE: -0.4346, PRED PPL: 1.5443
enm2mod/run/models/enm2mod_step_4000.pt 	 BLEU=23.5675
[2021-03-04 03:32:34,729 INFO] Translating shard 0.
[2021-03-04 03:33:01,824 INFO] PRED AVG SCORE: -0.4309, PRED PPL: 1.5387
enm2mod/run/models/enm2mod_step_4100.pt 	 BLEU=23.3729


By BLEU score, the best-performing checkpoint is the one saved after 4000 training steps, with a BLEU score of ~23.57.




### Modern to Middle

We can reuse the preprocessed (tokenized) data files written for the previous model; only the source and target roles are swapped.

In [None]:
# Folder holding everything produced for the Modern -> Middle English model
MOD2ENM_TRANSLATE_NAME = 'mod2enm'
!mkdir -p '{MOD2ENM_TRANSLATE_NAME}'

# PATH VARIABLES
MOD2ENM_TRANSLATE_PATH = Path(MOD2ENM_TRANSLATE_NAME)
MOD2ENM_RUN_PATH = MOD2ENM_TRANSLATE_PATH / 'run'
!mkdir -p "{MOD2ENM_RUN_PATH}"

# Dataset Variables (swap previous run): the KJV is now the source and Wycliffe the target
MOD2ENM_SOURCE_VER = 't_kjv'
MOD2ENM_TARGET_VER = 't_wyc'

In [None]:
# File extensions are the version names without their two-character prefix ('t_kjv' -> 'kjv')
MOD2ENM_SRC_EXT = MOD2ENM_SOURCE_VER[2:]
MOD2ENM_TGT_EXT = MOD2ENM_TARGET_VER[2:]

# dataset-type -> (path to source file, path to target file)
mod2enm_file_paths = {
    split: (DATA_PATH / f'{prefix}.{MOD2ENM_SRC_EXT}', DATA_PATH / f'{prefix}.{MOD2ENM_TGT_EXT}')
    for split, prefix in [
        ('training', 'bible-train'),
        ('validation', 'bible-valid'),
        ('test', 'bible-test'),
    ]
}

# Tokenizer options; must stay identical to the ones used when the files were written
token_kwargs = dict(case_markup=True)

# The data files were already tokenized and written for the first model,
# so write_tokenized_dataset is not called again here.

In [None]:
# Vocabulary files, referenced as src_vocab / tgt_vocab in the YAML config
MOD2ENM_SRC_VOCAB_PATH = MOD2ENM_RUN_PATH / 'vocab.src'
MOD2ENM_TGT_VOCAB_PATH = MOD2ENM_RUN_PATH / 'vocab.tgt'

# File name of the config, created inside CONFIG_PATH
mod2enm_yaml = 'mod2enm.yaml'

# Checkpoints are saved in this folder under names starting with the prefix
MOD2ENM_MODEL_PATH = MOD2ENM_RUN_PATH / 'models'
MOD2ENM_MODEL_PREFIX = 'mod2enm'

In [None]:
# OpenNMT-py configuration for the Modern -> Middle English model. Apart from the
# paths it uses the same hyperparameters as the Middle -> Modern configuration.
# NOTE(review): the training corpus is labelled `homilies` although it points at the
# bible training files; the label shows up in the build-vocab log -- confirm whether to rename it.
config =  f'''# {mod2enm_yaml}
save_data: {MOD2ENM_RUN_PATH}

### DATA PROPROCESSING ###
## Where the vocab(s) will be written
src_vocab: {MOD2ENM_SRC_VOCAB_PATH}
tgt_vocab: {MOD2ENM_TGT_VOCAB_PATH}

# Corpus opts:
data:
    homilies:
        path_src: {mod2enm_file_paths['training'][0]}
        path_tgt: {mod2enm_file_paths['training'][1]}
        transforms: []
        weight: 1
    valid:
        path_src: {mod2enm_file_paths['validation'][0]}
        path_tgt: {mod2enm_file_paths['validation'][1]}
        transforms: []

## silently ignore empty lines in data
skip_empty_level: silent

### TRAINING ###
## Where the model will be saved
save_model: {MOD2ENM_MODEL_PATH / MOD2ENM_MODEL_PREFIX}
save_checkpoint_steps: 1000
average_decay: 0.0005
seed: 1234
report_every: 100
train_steps: 100000
valid_steps: 100
early_stopping: 10
early_stopping_criteria: accuracy
tensorboard: True
tensorboard_log_dir: {MOD2ENM_RUN_PATH / 'logs'}

# Batching
world_size: 1
gpu_ranks: [0]
batch_size: 64
valid_batch_size: 64
batch_size_multiple: 1

# Optimization
model_dtype: "fp32"
optim: "adam"
learning_rate: 0.001

# Model
encoder_type: rnn
decoder_type: rnn
rnn_type: LSTM
enc_layers: 2
dec_layers: 2
rnn_size: 512
word_vec_size: 256
'''

# Write the rendered config where build_and_train will read it
with open(CONFIG_PATH / mod2enm_yaml, "w+") as config_yaml:
  config_yaml.write(config)

In [None]:
# Build the vocabulary and train the Modern -> Middle English model
build_and_train(CONFIG_PATH / mod2enm_yaml)

[2021-03-04 03:33:04,455 INFO] Counter vocab from -1 samples.
[2021-03-04 03:33:04,455 INFO] n_sample=-1: Build vocab on full datasets.
[2021-03-04 03:33:04,462 INFO] homilies's transforms: TransformPipe()
[2021-03-04 03:33:04,463 INFO] Loading ParallelCorpus(data/preprocessed/bible-train.kjv, data/preprocessed/bible-train.wyc, align=None)...
[2021-03-04 03:33:04,864 INFO] Counters src:9853
[2021-03-04 03:33:04,864 INFO] Counters tgt:13530
[2021-03-04 03:33:05,862 INFO] Parsed 2 corpora from -data.
[2021-03-04 03:33:05,863 INFO] Get special vocabs from Transforms: {'src': set(), 'tgt': set()}.
[2021-03-04 03:33:05,863 INFO] Loading vocab from text file...
[2021-03-04 03:33:05,863 INFO] Loading src vocabulary from mod2enm/run/vocab.src
[2021-03-04 03:33:05,880 INFO] Loaded src vocab has 9853 tokens.
[2021-03-04 03:33:05,883 INFO] Loading tgt vocabulary from mod2enm/run/vocab.tgt
[2021-03-04 03:33:05,925 INFO] Loaded tgt vocab has 13530 tokens.
[2021-03-04 03:33:05,931 INFO] Building fie

In [None]:
# retrieve the models: every file in the model folder whose name starts with the prefix
# NOTE(review): listdir gives no ordering guarantee, so checkpoints may be evaluated in any order
mod2enm_model_paths = [ MOD2ENM_MODEL_PATH / f for f in listdir(MOD2ENM_MODEL_PATH) if f.startswith(MOD2ENM_MODEL_PREFIX)]

# Folder receiving one prediction file per checkpoint
MOD2ENM_PREDICTIONS_PATH = MOD2ENM_RUN_PATH / 'predictions'
!mkdir -p "{MOD2ENM_PREDICTIONS_PATH}"

# Translate the test split with every checkpoint (beam size 10) and report BLEU
mod2enm_scores, _ = evaluate(mod2enm_model_paths, mod2enm_file_paths['test'][0], mod2enm_file_paths['test'][1], MAX_SENTENCE_LENGTH, 10, token_kwargs, MOD2ENM_PREDICTIONS_PATH)

[2021-03-04 03:43:25,540 INFO] Translating shard 0.
mod2enm/run/models/mod2enm_step_1000.pt 	 BLEU=16.3994
[2021-03-04 03:44:00,074 INFO] Translating shard 0.
[2021-03-04 03:44:29,053 INFO] PRED AVG SCORE: -0.6420, PRED PPL: 1.9002
mod2enm/run/models/mod2enm_step_2000.pt 	 BLEU=22.4017
[2021-03-04 03:44:33,608 INFO] Translating shard 0.
[2021-03-04 03:45:02,632 INFO] PRED AVG SCORE: -0.5147, PRED PPL: 1.6732
mod2enm/run/models/mod2enm_step_3000.pt 	 BLEU=23.1465
[2021-03-04 03:45:07,155 INFO] Translating shard 0.
[2021-03-04 03:45:35,704 INFO] PRED AVG SCORE: -0.4507, PRED PPL: 1.5694
mod2enm/run/models/mod2enm_step_4000.pt 	 BLEU=22.6068
[2021-03-04 03:45:40,324 INFO] Translating shard 0.
[2021-03-04 03:46:08,694 INFO] PRED AVG SCORE: -0.4365, PRED PPL: 1.5473
mod2enm/run/models/mod2enm_step_4300.pt 	 BLEU=22.4027


Looking at the BLEU scores, the best checkpoint is the one saved after 3000 training steps (checkpoints are numbered by step, not by epoch), with a BLEU score of ~23.15.