<a href="https://colab.research.google.com/github/markaaronslater/NMT/blob/master/unit_testing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab environment for unit-testing the NMT pipeline end to end on a small slice of the corpus and inspecting the output of each step.

In [None]:
# Reload imported modules automatically before each cell executes, so edits to the
# NMT source files take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Mount Google Drive at /content/gdrive; the path configuration further down
# points at 'My Drive/NMT/' underneath this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# The NMT package lives in Google Drive, while a fresh Colab kernel starts in
# /content. Put the Drive root on sys.path before importing; otherwise these
# imports only succeed if the `%cd` cell further down happened to be run first.
import sys

NMT_PARENT_DIR = '/content/gdrive/My Drive'  # folder that contains the cloned NMT/ directory
if NMT_PARENT_DIR not in sys.path:
    sys.path.insert(0, NMT_PARENT_DIR)

# NOTE(review): if any of these modules import stanza / subword-nmt / sacrebleu at
# module level, the install cell below must be run before this one on a fresh runtime.
from NMT.src.preprocessing.apply_stanza_processors import apply_stanza_processors, retrieve_stanza_outputs
from NMT.src.preprocessing.corpus_utils import read_corpuses, print_corpuses, print_processed_corpuses
from NMT.src.preprocessing.truecase import truecase_corpuses
from NMT.src.import_configs import import_configs
from NMT.src.preprocessing.preprocess import construct_model_data, retrieve_model_data
from NMT.src.train import train


In [None]:
!pip install subword-nmt # for segmenting words into subwords
!pip install stanza # for tokenizing corpus and tagging with morphological data
!pip install sacrebleu # for evaluation
!git clone https://github.com/moses-smt/mosesdecoder.git # for detokenizing model outputs prior to evaluation

In [None]:
# Show the GPU attached to this runtime (if any) along with its current usage.
!nvidia-smi

In [None]:
# Recommended layout: the cloned NMT folder sits directly inside the Google Drive
# folder 'My Drive'. Every directory below is derived from that root.
path = '/content/gdrive/My Drive/NMT/'
corpus_path = f"{path}corpuses/iwslt16_en_de/"
config_path = f"{path}configs/"
data_path = f"{path}data/"
checkpoint_path = f"{path}checkpoints/"

# Name under which the model's tensor batches, hyperparameters, etc. are saved
# as a pickle file inside data_path.
model_name = 'my_model'

In [None]:
# Work from the Drive root: the subword script in step 4 is invoked through the
# relative path ./NMT/..., which only resolves from this directory.
%cd /content/gdrive/My Drive/

In [None]:

# Sanity check: read a small slice of each raw (unprefixed) corpus file and display it.
# NOTE(review): _start=1 / num=5 presumably select five lines starting at line 1;
# confirm against read_corpuses.
corpuses = read_corpuses(
    "train.de", "train.en",
    "dev.de", "dev.en",
    "test.de",
    path=corpus_path,
    prefix='',
    _start=1,
    num=5,
)
print_corpuses(corpuses, num=5)


In [None]:
# step 1
# Call apply_stanza_processors on a small slice (_start=1, num=5) of every corpus
# file, then fetch the results with retrieve_stanza_outputs and display them.
# NOTE(review): presumably the first call persists its output under corpus_path
# and the second call loads it back; confirm in apply_stanza_processors.py.
corpus_names = ("train.de", "train.en", "dev.de", "dev.en", "test.de")
apply_stanza_processors(*corpus_names, path=corpus_path, _start=1, num=5)
corpuses = retrieve_stanza_outputs(*corpus_names, path=corpus_path)

print_processed_corpuses(corpuses, num=5)

In [None]:
# step 2
# Truecase every corpus, then read back the files carrying the 'word_' prefix and
# display them.
# NOTE(review): the 'word_' files are presumably what truecase_corpuses writes;
# confirm in truecase.py.
corpus_names = ("train.de", "train.en", "dev.de", "dev.en", "test.de")
truecase_corpuses(*corpus_names, path=corpus_path)
corpuses = read_corpuses(*corpus_names, path=corpus_path, prefix='word_')
print_corpuses(corpuses)



In [None]:
# step 3
# Load the hyperparameter configuration and list every setting, one per line.
hyperparams = import_configs(config_path=config_path)
for hp, value in hyperparams.items():
    print(f"{hp}: {value}")

In [None]:
# step 4
subword_corpus_path = '/content/gdrive/My\ Drive/NMT/corpuses/iwslt16_en_de/'
num_merge_ops = 1000 # for unit testing, overwrite to smaller values
vocab_threshold = 2
!bash ./NMT/src/preprocessing/subword_joint.sh 1000 2 '/content/gdrive/My Drive/NMT/corpuses/iwslt16_en_de/'
#!bash ./NMT/src/preprocessing/subword_joint.sh $num_merge_ops $vocab_threshold $subword_corpus_path

In [None]:
# Disabled inspection cell, kept for reference.
# NOTE(review): read_tokenized_corpuses is not among the names imported at the top
# of this notebook; import it before re-enabling these lines.
# corpuses, ref_corpuses = read_tokenized_corpuses("train.de", "train.en", "dev.de", "dev.en", "test.de", path='/content/gdrive/My Drive/NMT/corpuses/iwslt16_en_de/', prefix='word_')
# print_corpuses(corpuses)
# print_corpuses(ref_corpuses)



In [None]:
# step 5
# Start from the configured hyperparameters, then override a few of them for the
# unit test before building the model data: vocab_type "word", trim_type "top_k",
# and 50 for both src_k and trg_k.
hyperparams = import_configs(config_path=config_path)
unit_test_overrides = {
    "vocab_type": "word",
    "trim_type": "top_k",
    "src_k": 50,
    "trg_k": 50,
}
for setting, override in unit_test_overrides.items():
    hyperparams[setting] = override

vocabs, corpuses, ref_corpuses = construct_model_data(
    "train.de", "train.en",
    "dev.de", "dev.en",
    "test.de",
    hyperparams=hyperparams,
    corpus_path=corpus_path,
    data_path=data_path,
    model_name=model_name,
)

# step 6
# Load the stored model data back and rebind the notebook-level names from it, so
# the training step uses what was actually saved rather than the in-memory objects.
model_data = retrieve_model_data(data_path=data_path, model_name=model_name)

(train_batches, dev_batches, test_batches,
 idx_to_trg_word, ref_corpuses, hyperparams) = (
    model_data[field]
    for field in ("train_batches", "dev_batches", "test_batches",
                  "idx_to_trg_word", "ref_corpuses", "hyperparams")
)

# Only the reference corpuses are displayed, framed by blank-line spacers.
# To inspect anything else, print one of: vocabs, corpuses, train_batches,
# dev_batches, test_batches, idx_to_trg_word, hyperparams.
spacer = '\n\n\n\n\n'
print(spacer)
print(ref_corpuses)
print(spacer)




In [None]:
# step 8
# Overfit on the training set of 5 sentences.
# NOTE(review): the references are taken from train.en, while the batches passed
# as the third argument are dev_batches. Confirm against train() that scoring
# those batches against training references is intended for this unit test.
references = ref_corpuses["train.en"]
model = train(
    hyperparams,
    train_batches,
    dev_batches,
    references,
    idx_to_trg_word,
    checkpoint_path,
    save=True,
)