<a href="https://colab.research.google.com/github/markaaronslater/NMT/blob/master/NMT_driver.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Enable the autoreload extension so that edits to imported modules are picked
# up on the next cell execution, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Mount Google Drive at /content/gdrive; the paths configured later in this
# notebook all live underneath this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
!pip install subword-nmt # for segmenting words into subwords
!pip install stanza # for tokenizing corpus and tagging with morphological data
!pip install sacremoses # for detokenizing model predictions
!pip install sacrebleu # for evaluation

In [None]:
# Make sure a GPU is attached to this runtime
# (Runtime -> Change runtime type -> Hardware accelerator = GPU).
# nvidia-smi lists the visible NVIDIA devices; it fails if no GPU/driver is present.
!nvidia-smi

In [None]:
# Root of the cloned NMT repository (recommended location: the Google Drive
# folder 'My Drive'). Every path defined in this cell ends with a trailing slash.
path = '/content/gdrive/My Drive/NMT/'
corpus_path = f'{path}data/iwslt/en-de/'
config_path = f'{path}configs/'

### REQUIRED: ###
# Create a folder named <model_name> inside of checkpoints/. It will hold all
# checkpoints for the model, its per-epoch training stats, and the files with
# its greedy dev set predictions after each epoch.
# Remember to ensure the desired settings are set in the config files of NMT/configs/ !!!
model_name = 'no_bridge_AdamW/'  # your model name here.
checkpoint_path = f'{path}checkpoints/{model_name}'

In [None]:
# Work from the repository root so that the `src` package imports and the
# relative script/test paths used in later cells resolve.
# NOTE(review): this hard-codes the same location as `path` in the config cell;
# keep the two in sync if the repository is moved.
%cd /content/gdrive/My Drive/NMT

In [None]:
from src.preprocessing.apply_stanza_processors import apply_stanza_processors
from src.preprocessing.truecase import truecase_corpuses
from src.preprocessing.preprocess import construct_model_data
from src.preprocessing.corpus_utils import read_corpus, get_references
from src.import_configs import import_configs
from src.train import train, load_checkpoint
from src.predict import predict
from src.evaluate import evaluate
from src.model_utils import load_pretrained
from src.translate import translate
import stanza
from subword_nmt.apply_bpe import BPE

In [None]:
# step 1 - tokenize corpuses, and tag with morphological data.
# All four corpora are processed: step 4 builds the model data from train.de,
# train.en, dev.de and dev.en, so processing only dev.en would leave the rest of
# the pipeline without input on a fresh run.
apply_stanza_processors("train.de", "train.en", "dev.de", "dev.en", path=corpus_path)


In [None]:
# step 2 - true-case corpuses using linguistic heuristics that leverage morphological
# data produced by morphological data tagger.
# All four corpora are true-cased: step 4 builds the model data from train.de,
# train.en, dev.de and dev.en, so true-casing only dev.en would leave the rest of
# the pipeline without input on a fresh run.
truecase_corpuses("train.de", "train.en", "dev.de", "dev.en", corpus_path=corpus_path)

In [None]:
# import vocab, training, and model hyperparameter settings from configuration files.
# `hyperparams` is indexed by string keys in later cells (e.g. "num_merge_ops").
hyperparams = import_configs(config_path=config_path)
# echo the loaded settings so they can be checked before running the pipeline.
print(hyperparams)

In [None]:
# step 3 - segment words of corpuses into subwords (skip this cell if using a word-level vocabulary).
num_merge_ops = hyperparams["num_merge_ops"]
vocab_threshold = hyperparams["vocab_threshold"]
truecased_path = corpus_path + 'truecased/'
segmented_path = corpus_path + 'subword_segmented/'

!bash ./src/preprocessing/subword_joint.sh $num_merge_ops $vocab_threshold "$truecased_path" "$segmented_path"

In [None]:
# step 4 - build intelligently batched sets of tensors that can be directly passed to model.
# Corpora are read from the subword-segmented folder; references come from corpus_path itself.
construct_model_data(
    "train.de",
    "train.en",
    "dev.de",
    "dev.en",
    hyperparams=hyperparams,
    corpus_path=f'{corpus_path}subword_segmented/',
    reference_path=corpus_path,
    checkpoint_path=checkpoint_path,
)


In [None]:
# step 5 - instantiate and train model.
# train() is pointed at the checkpoint folder configured earlier; its two
# return values are bound to `model` and `loss`.
model, loss = train(checkpoint_path=checkpoint_path)

In [None]:
# prepare pretrained model for end-to-end inference:
# download the German stanza models, then build the pipeline that is handed to
# translate() in the evaluation cells.
stanza.download(lang='de', processors='tokenize,mwt,pos')
stanza_de_processor = stanza.Pipeline(
    lang='de',
    processors='tokenize,mwt,pos',
    tokenize_no_ssplit=True,
    tokenize_batch_size=64,
    mwt_batch_size=200,
    pos_batch_size=10000,
)

In [None]:
# step 6 - evaluate test set predictions.
# load the pretrained translator together with the vocab mappings and dev data
# stored alongside it.
translator, model_data = load_pretrained(checkpoint_path)
src_word_to_idx = model_data["src_word_to_idx"]
idx_to_trg_word = model_data["idx_to_trg_word"]

# optional: first observe beam search predictions of best model on dev set
# (during training, used greedy search).
# bleu should improve by ~1.5
dev_batches = model_data["dev_batches"]
dev_references = model_data["references"]

translator.decoder.set_inference_alg("beam_search", 0.0)

# post_time is returned by predict() but not reported here.
dev_translations, preds_time, post_time = predict(translator, dev_batches, idx_to_trg_word, checkpoint_path)
bleu = evaluate(dev_translations, dev_references)
print(round(bleu, 2))
print(preds_time)

# evaluate model on test set.
# The codes file is opened in a `with` block (explicitly as UTF-8) so the handle
# is closed as soon as BPE has read the merge operations in its constructor.
# NOTE(review): the codes are read from 'subword_segmented_more_val/', whereas
# steps 3-4 use 'subword_segmented/' - confirm this is the segmentation the
# loaded model was trained with.
with open(corpus_path + 'subword_segmented_more_val/bpe_codes', 'r', encoding='utf-8') as bpe_codes:
    bpe = BPE(bpe_codes, vocab=set(src_word_to_idx))
test_path = path + 'data/iwslt/en-de/'
test_set = read_corpus('test.de', path=test_path)
test_references = get_references(path=test_path, dev=False)
translations = translate(test_set, stanza_de_processor, translator, src_word_to_idx, idx_to_trg_word, bpe)
print(evaluate(translations, test_references))



In [None]:
# Sweep the second argument of set_inference_alg for beam search and report the
# test-set BLEU obtained with each value.
# NOTE(review): beta is presumably a length-normalization strength - confirm in the decoder.
for beta in (0.0, 0.2, 0.4, 0.6, 0.8, 1.0):
    translator.decoder.set_inference_alg("beam_search", beta)
    translations = translate(
        test_set, stanza_de_processor, translator, src_word_to_idx, idx_to_trg_word, bpe
    )
    bleu = evaluate(translations, test_references)
    print(f"beta: {beta}, bleu: {bleu}")

In [None]:
# optional: run the unit tests to verify the correctness of the model implementations.
# they can be run individually, or discovered and run all at once (see the cells below).

In [None]:
# allow ~5 min to run all model variant tests, each of which trains for 100 epochs.
# no path is given, so pytest discovers the tests starting from the current directory.
!python -m pytest

In [None]:
# run only the tests in unittests/test_batches.py.
!python -m pytest unittests/test_batches.py # ensure intelligent batching procedure is correct

In [None]:
# run the single test test_default_word_model; -s disables output capturing so
# the test's prints are shown, -v reports each test by name.
!python -m pytest -s -v unittests/test_model.py::test_default_word_model

In [None]:
# run the single test test_default_subword_model; -s disables output capturing
# so the test's prints are shown, -v reports each test by name.
!python -m pytest -s -v unittests/test_model.py::test_default_subword_model