<a href="https://colab.research.google.com/github/markaaronslater/NMT/blob/master/NMT_driver.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules are reloaded automatically before each cell runs (no kernel restart
# needed while working on the project's source files).
%load_ext autoreload
%autoreload 2

In [None]:
# Mount Google Drive at /content/gdrive. The project paths configured later in
# this notebook all live under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Install the third-party dependencies. `%pip` is used instead of `!pip` so the
# packages are installed into the environment of the kernel running this
# notebook. Explanations sit on their own lines so that nothing but the
# package name is passed to the install command.

# subword-nmt: for segmenting words into subwords
%pip install subword-nmt
# stanza: for tokenizing corpus and tagging with morphological data
%pip install stanza
# sacremoses: for detokenizing model predictions
%pip install sacremoses
# sacrebleu: for evaluation
%pip install sacrebleu

In [None]:
# Make sure the runtime is using a GPU before training
# (Runtime -> Change runtime type -> Hardware accelerator = GPU).
# nvidia-smi prints the GPU(s) visible to this runtime.
!nvidia-smi

In [None]:
# Project layout. Recommended: place the cloned NMT folder in the Google Drive
# folder 'My Drive'. Every directory string ends with '/' because the paths
# below are built by plain string concatenation.
path = '/content/gdrive/My Drive/NMT/'
config_path = f'{path}configs/'
corpus_path = f'{path}corpuses/iwslt16_en_de/'

# Give the model a name representing, e.g., its major hyperparameter setting
# differences from other models. It becomes this model's sub-directory under
# checkpoints/.
# NOTE(review): an earlier comment said the model's tensor batches,
# hyperparameters, etc. are saved as a pickle file under this name - confirm
# against construct_model_data.
model_name = 'my_model/'
checkpoint_path = f'{path}checkpoints/{model_name}'


In [None]:
# Work from the repo root so that the `src.*` imports and the relative script
# and test paths used in later cells resolve. The directory is taken from
# `path` (set in the path settings cell) instead of being hard-coded a second
# time, so moving the repo only requires changing `path`.
%cd {path}

In [None]:
from src.preprocessing.apply_stanza_processors import apply_stanza_processors
from src.preprocessing.truecase import truecase_corpuses
from src.import_configs import import_configs
from src.preprocessing.preprocess import construct_model_data, retrieve_model_data
from src.train import train, load_checkpoint
from src.predict import predict
from src.evaluate import evaluate

In [None]:
# only meaningful for unit tests on subsets of corpus data: _start is the starting line number
# (using 1-based indexing), and num is how many lines to extract. if num is None, all lines from _start to the end of the corpus are extracted.
# _start = 1
# num = None
# num = 10 # uncomment this line if unit testing

In [None]:
# step 1 - run the stanza processors over the corpus files under corpus_path:
# tokenize the corpuses, and tag them with morphological data.
apply_stanza_processors(
    "train.de",
    "train.en",
    "dev.de",
    "test.de",
    path=corpus_path,
)

In [None]:
# step 2 - true-case corpuses using linguistic heuristics that leverage the
# morphological data produced by the morphological data tagger in step 1.
truecase_corpuses("train.de", "train.en", "dev.de", "test.de", corpus_path=corpus_path)

In [None]:
# import vocab, training, and model hyperparameter settings from the
# configuration files under config_path. later cells look settings up by name
# (e.g. hyperparams["num_merge_ops"]) and pass the whole object on to
# construct_model_data and train.
hyperparams = import_configs(config_path=config_path)

In [None]:
# step 3 - segment the words of the corpuses into subwords
# (skip this cell if using a word-level vocabulary).

# directories handed to the segmentation script as its last two arguments.
truecased_path = f'{corpus_path}truecased/'
segmented_path = f'{corpus_path}subword_segmented/'

# segmentation settings, read from the imported configuration.
num_merge_ops = hyperparams["num_merge_ops"]
vocab_threshold = hyperparams["vocab_threshold"]

# the two paths are quoted because, with the default settings, they contain a
# space ('My Drive').
!bash ./src/preprocessing/subword_joint.sh $num_merge_ops $vocab_threshold "$truecased_path" "$segmented_path"

In [None]:
# step 4 - build intelligently batched sets of tensors that can be directly
# passed to the model.

# p points at the preprocessed corpuses that the model consumes directly.
# NOTE(review): if step 3 was skipped (word-level vocabulary), p presumably
# has to point at the truecased corpuses instead - confirm.
p = f'{corpus_path}subword_segmented/'

train_batches, dev_batches, vocabs, hyperparams = construct_model_data(
    "train.de",
    "train.en",
    "dev.de",
    "test.de",
    hyperparams=hyperparams,
    corpus_path=p,
    checkpoint_path=checkpoint_path,
)

In [None]:
# step 5 - instantiate and train model.
# train receives the hyperparameters, the train/dev batches and
# checkpoint_path, and is called with save=True.
# NOTE(review): dev_references and idx_to_trg_word are not assigned in any
# cell visible in this notebook, so unless they are defined elsewhere this
# call raises NameError on a fresh kernel. presumably they should be taken
# from construct_model_data's return values (e.g. vocabs) or from
# retrieve_model_data - confirm and define them before this call.

model, loss = train(hyperparams, train_batches, dev_batches, dev_references, idx_to_trg_word, checkpoint_path, save=True)

In [None]:
# keep only element 0 of the first value returned by train, and show it.
# NOTE(review): this cell is not idempotent - it rebinds `model`, so running it
# a second time indexes the already-unwrapped object. TODO confirm what train
# returns and unpack it once, in the training cell.
model = model[0]
print(model)

In [None]:
# step 6 - evaluate predictions.
# NOTE(review): as written, this cell translates and scores the dev set
# (dev_batches / dev_references); switching to the test set is still a TODO
# (see below).

# # can load a checkpoint rather than using prev cell's model:
# # NOTE(review): the two branches unpack load_checkpoint's result differently
# # (`model` vs `model, _`) - confirm which one matches its return value.
# if hyperparams["early_stopping"]:
#     model = load_checkpoint(hyperparams, checkpoint_path, "best_model")
# else:
#     model, _ = load_checkpoint(hyperparams, checkpoint_path, "most_recent_model")

# use beam search instead of greedy search.
model.decoder.set_inference_alg("beam_search")

# TODO: change to test_batches
# TODO: get test batches


# (can read predictions inside checkpoints/beam_preds.txt)
# NOTE(review): dev_references and idx_to_trg_word are not assigned in any
# cell visible in this notebook - define them before running this cell.
# post_time is returned by predict but not used below.
dev_translations, preds_time, post_time = predict(model, dev_batches, idx_to_trg_word, checkpoint_path)
bleu = evaluate(dev_translations, dev_references)
print(round(bleu, 2))
print(preds_time)

In [None]:
# b) run unit tests to show correctness of model implementations
# can run each separately, or discover and run all at once (see below)

In [None]:
# allow ~5 min to run all model variant tests, each of which trains for 100 epochs.
# with no path argument, pytest discovers and runs every test it can find
# starting from the current working directory.
!python -m pytest

In [None]:
# run only the tests collected from unittests/test_batches.py.
!python -m pytest unittests/test_batches.py # ensure intelligent batching procedure is correct

In [None]:
# run the single test test_default_word_model from unittests/test_model.py.
# -s disables output capturing, so anything the test prints is shown;
# -v reports each test by name.
!python -m pytest -s -v unittests/test_model.py::test_default_word_model

In [None]:
# run the single test test_default_subword_model from unittests/test_model.py
# (-s: show the test's output, -v: verbose).
# judging by the name, the subword-vocabulary counterpart of the default
# word-level test - confirm in unittests/test_model.py.
!python -m pytest -s -v unittests/test_model.py::test_default_subword_model

In [None]:
# before running this cell, make sure the runtime is using the CPU
# (Runtime -> Change runtime type -> Hardware accelerator = None).
# allow several minutes for this test to run.
# runs the single test test_default_word_model_cpu from unittests/test_model.py
# (-s: show the test's output, -v: verbose).
!python -m pytest -s -v unittests/test_model.py::test_default_word_model_cpu

In [None]:
# run the single test test_uni_no_attn from unittests/test_model.py
# (-s: show the test's output, -v: verbose).
# judging by the name, a unidirectional model without attention - confirm in
# unittests/test_model.py.
!python -m pytest -s -v unittests/test_model.py::test_uni_no_attn

In [None]:
# run the single test test_layer_to_layer_uni_no_attn from
# unittests/test_model.py (-s: show the test's output, -v: verbose).
# judging by the name, a "layer to layer" variant of the unidirectional,
# no-attention model - confirm in unittests/test_model.py.
!python -m pytest -s -v unittests/test_model.py::test_layer_to_layer_uni_no_attn

In [None]:
# run the single test test_final_to_first_uni_no_attn from
# unittests/test_model.py (-s: show the test's output, -v: verbose).
# judging by the name, a "final to first" variant of the unidirectional,
# no-attention model - confirm in unittests/test_model.py.
!python -m pytest -s -v unittests/test_model.py::test_final_to_first_uni_no_attn

In [None]:
# run the single test test_dropout from unittests/test_model.py
# (-s: show the test's output, -v: verbose).
# judging by the name, a model variant with dropout enabled - confirm in
# unittests/test_model.py.
!python -m pytest -s -v unittests/test_model.py::test_dropout

In [None]:
# run the single test test_no_tying from unittests/test_model.py
# (-s: show the test's output, -v: verbose).
# judging by the name, a model variant without weight tying - confirm in
# unittests/test_model.py.
!python -m pytest -s -v unittests/test_model.py::test_no_tying

In [None]:
# run the single test test_no_attn_no_tying from unittests/test_model.py
# (-s: show the test's output, -v: verbose).
# judging by the name, a model variant with neither attention nor weight
# tying - confirm in unittests/test_model.py.
!python -m pytest -s -v unittests/test_model.py::test_no_attn_no_tying

In [None]:
# c) load a pre-trained model checkpoint to determine BLEU score on test set.
