<a href="https://colab.research.google.com/github/markaaronslater/NMT/blob/master/unit_testing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# environment for running unit tests, observing model outputs, etc.

In [None]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Mount Google Drive at /content/gdrive; every path used later in this
# notebook is rooted under that mount point.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Install the third-party packages this notebook relies on.
# NOTE(review): versions are not pinned — consider pinning for reproducibility.
!pip install subword-nmt # for segmenting words into subwords
!pip install stanza # for tokenizing corpus and tagging with morphological data
!pip install sacremoses # for detokenizing model predictions
!pip install sacrebleu # for evaluation
#!git clone https://github.com/moses-smt/mosesdecoder.git # for detokenizing model outputs prior to evaluation

In [None]:
# Show the GPU(s) visible to this runtime (later cells place tensors on CUDA).
!nvidia-smi

In [None]:
# Root of the cloned NMT folder.
# recommended: place cloned NMT folder in Google drive folder 'My Drive':
path = '/content/gdrive/My Drive/NMT/'

# Sub-directories of the project root; each keeps its trailing slash.
config_path = f"{path}configs/"
data_path = f"{path}data/"
checkpoint_path = f"{path}checkpoints/"

# Corpus directory: the toy corpuses are active; swap in the commented line
# to point at the full IWSLT16 en-de corpus instead.
#corpus_path = path + 'corpuses/iwslt16_en_de/'
corpus_path = f"{path}corpuses/toy_corpuses/"

# name of model tensor batches, hyperparameters, etc., saved as pickle file inside data_path
model_name = 'my_model'

In [None]:
# Work from the Drive root: the `NMT.src...` imports and the relative
# `./NMT/...` script path used later are resolved from this directory.
%cd /content/gdrive/My Drive/

In [None]:
from NMT.src.preprocessing.apply_stanza_processors import apply_stanza_processors, retrieve_stanza_outputs
from NMT.src.preprocessing.corpus_utils import read_corpuses, print_corpuses, print_processed_corpuses
from NMT.src.preprocessing.truecase import truecase_corpuses
from NMT.src.import_configs import import_configs
from NMT.src.preprocessing.preprocess import construct_model_data, retrieve_model_data
from NMT.src.train import train


In [None]:

# Load a slice of every corpus file (empty file-name prefix) and print it as a sanity check.
# NOTE(review): `_start` / `num` are interpreted by read_corpuses — presumably
# a starting line and a line count; confirm against its definition.
corpuses = read_corpuses(
    "train.de", "train.en",
    "dev.de", "dev.en",
    "test.de",
    path=corpus_path,
    prefix='',
    _start=1,
    num=5,
)
print_corpuses(corpuses, num=5)


In [None]:
# step 1: apply the stanza processors to each corpus file, then retrieve
# their outputs and print them.
corpus_names = ("train.de", "train.en", "dev.de", "dev.en", "test.de")

apply_stanza_processors(*corpus_names, path=corpus_path, _start=1, num=10)
corpuses = retrieve_stanza_outputs(*corpus_names, path=corpus_path)
print_processed_corpuses(corpuses)

In [None]:
# step 2: truecase each corpus file, then read back the 'word_'-prefixed
# files (presumably the ones truecasing writes — confirm) and print them.
corpus_names = ("train.de", "train.en", "dev.de", "dev.en", "test.de")

truecase_corpuses(*corpus_names, path=corpus_path)
corpuses = read_corpuses(*corpus_names, path=corpus_path, prefix='word_')
print_corpuses(corpuses)



In [None]:
# step 3
# Load the hyperparameter configuration found under config_path and print
# every setting as "name: value" for inspection.
hyperparams = import_configs(config_path=config_path)
for hp in hyperparams:
   print(f"{hp}: {hyperparams[hp]}")

In [None]:
# step 4
subword_corpus_path = '/content/gdrive/My\ Drive/NMT/corpuses/iwslt16_en_de/'
num_merge_ops = 1000 # for unit testing, overwrite to smaller values
vocab_threshold = 2
!bash ./NMT/src/preprocessing/subword_joint.sh 1000 2 '/content/gdrive/My Drive/NMT/corpuses/iwslt16_en_de/'
#!bash ./NMT/src/preprocessing/subword_joint.sh $num_merge_ops $vocab_threshold $subword_corpus_path

In [None]:
# corpuses, ref_corpuses = read_tokenized_corpuses("train.de", "train.en", "dev.de", "dev.en", "test.de", path='/content/gdrive/My Drive/NMT/corpuses/iwslt16_en_de/', prefix='word_')
# print_corpuses(corpuses)
# print_corpuses(ref_corpuses)



In [None]:
# toy corpuses: run the whole preprocessing sequence on the toy training pair,
# printing the corpuses after each stage.
corpus_path = f"{path}corpuses/toy_corpuses/"
toy_corpus_names = ("train.de", "train.en")

# files as read with an empty prefix
corpuses = read_corpuses(*toy_corpus_names, path=corpus_path, prefix='')
print_corpuses(corpuses)

# stanza processing
apply_stanza_processors(*toy_corpus_names, path=corpus_path)
corpuses = retrieve_stanza_outputs(*toy_corpus_names, path=corpus_path)
print_processed_corpuses(corpuses)

# truecasing, then re-read the 'word_'-prefixed files
truecase_corpuses(*toy_corpus_names, path=corpus_path)
corpuses = read_corpuses(*toy_corpus_names, path=corpus_path, prefix='word_')
print_corpuses(corpuses)


In [None]:
# ensure batches of tensors constructed correctly.
# NOTE(review): the setup below repeats the autoreload, path and working-directory
# cells from earlier in the notebook — presumably so this cell can be run on
# its own once the Drive is mounted; confirm before de-duplicating.


%load_ext autoreload
%autoreload 2

# recommended: place cloned NMT folder in Google drive folder 'My Drive':
path = '/content/gdrive/My Drive/NMT/'
#corpus_path = path + 'corpuses/iwslt16_en_de/'
corpus_path = path + 'corpuses/toy_corpuses/'

config_path = path + 'configs/'
data_path = path + 'data/'
checkpoint_path = path + 'checkpoints/'

model_name = 'my_model' # name of model tensor batches, hyperparameters, etc., saved as pickle file inside data_path


# Work from the Drive root so the `NMT.src...` imports below resolve.
%cd /content/gdrive/My Drive/

from NMT.src.import_configs import import_configs
from NMT.src.preprocessing.preprocess import construct_model_data, retrieve_model_data
from NMT.src.train import train

# step 5: load the configured hyperparameters, apply the overrides used by
# this test, and construct the model data from the toy training pair.
hyperparams = import_configs(config_path=config_path)

# Overrides applied on top of the imported configuration, in this order.
toy_overrides = {
    "vocab_type": "word",
    "trim_type": "top_k",
    "src_k": 10,
    "trg_k": 10,
    "train_bsz": 2,
    "dev_bsz": 2,
    "decode_slack": 30,
}
for hp_name, hp_value in toy_overrides.items():
    hyperparams[hp_name] = hp_value

vocabs, corpuses, ref_corpuses = construct_model_data(
    "train.de",
    "train.en",
    hyperparams=hyperparams,
    corpus_path=corpus_path,
    data_path=data_path,
    model_name=model_name,
    overfit=True,
)





# step 6: reload the saved model data and check every batch tensor against
# hard-coded expected values.
# The expected values depend on the toy corpus and on the hyperparameter
# overrides set in step 5; changing either invalidates them.
import torch  # used by the assertions below; no earlier cell imports it

model_data = retrieve_model_data(data_path=data_path, model_name=model_name)

train_batches = model_data["train_batches"]
dev_batches = model_data["dev_batches"]
test_batches = model_data["test_batches"]
idx_to_trg_word = model_data["idx_to_trg_word"]
ref_corpuses = model_data["ref_corpuses"]
hyperparams = model_data["hyperparams"]
# All expected tensors are built on this device so they can be compared
# element-wise with the stored batches.
device = hyperparams["device"]


print(f'src vocab:{vocabs["src_word_to_idx"]}')
print(f'trg vocab:{vocabs["trg_word_to_idx"]}')


print('\n\n\n\n\n')
### train_batches:
# each train batch unpacks into (encoder_inputs, decoder_inputs, decoder_targets)
# train batch 1
encoder_inputs, decoder_inputs, decoder_targets = train_batches[0]
assert torch.all(torch.eq(encoder_inputs['in'], torch.tensor([[3, 6, 4, 8], [9, 2, 7, 0]], device=device)))
assert torch.all(torch.eq(encoder_inputs['sorted_lengths'], torch.tensor([4, 3], device=device)))
assert torch.all(torch.eq(encoder_inputs['idxs_in_sorted'], torch.tensor([1, 0], device=device)))

assert torch.all(torch.eq(decoder_inputs['in'], torch.tensor([[ 2,  7,  4,  9,  5], [ 2, 11,  6, 10,  0]], device=device)))
assert torch.all(torch.eq(decoder_inputs['lengths'], torch.tensor([5, 4], device=device)))
assert torch.all(torch.eq(decoder_inputs['mask'], torch.tensor([[[False, False, False,  True]], [[False, False, False, False]]], device=device)))

assert torch.all(torch.eq(decoder_targets, torch.tensor([ 7, 11,  4,  6,  9, 10,  5,  3,  3], device=device)))


# train batch 2
encoder_inputs, decoder_inputs, decoder_targets = train_batches[1]
assert torch.all(torch.eq(encoder_inputs['in'], torch.tensor([[10,  5,  11, 11, 11]], device=device)))
assert torch.all(torch.eq(encoder_inputs['sorted_lengths'], torch.tensor([5], device=device)))
assert torch.all(torch.eq(encoder_inputs['idxs_in_sorted'], torch.tensor([0], device=device)))

assert torch.all(torch.eq(decoder_inputs['in'], torch.tensor([[2, 8]], device=device)))
assert torch.all(torch.eq(decoder_inputs['lengths'], torch.tensor([2], device=device)))
assert torch.all(torch.eq(decoder_inputs['mask'], torch.tensor([[[False, False, False, False, False]]], device=device)))

assert torch.all(torch.eq(decoder_targets, torch.tensor([8, 3], device=device)))


### dev_batches:
# each dev batch unpacks into (encoder_inputs, decoder_inputs, corpus_indices)
# dev batch 1
encoder_inputs, decoder_inputs, corpus_indices = dev_batches[0]
assert torch.all(torch.eq(encoder_inputs['in'], torch.tensor([[10,  5, 11, 11, 11], [ 3,  6,  4,  8,  0]], device=device)))
assert torch.all(torch.eq(encoder_inputs['sorted_lengths'], torch.tensor([5, 4], device=device)))
assert torch.all(torch.eq(encoder_inputs['idxs_in_sorted'], torch.tensor([0, 1], device=device)))

# use the configured device (not a hard-coded 'cuda:0') like every other check
assert torch.all(torch.eq(decoder_inputs['mask'], torch.tensor([[[False, False, False, False, False]], [[False, False, False, False,  True]]], device=device)))
assert decoder_inputs['max_src_len'] == 5

assert torch.all(torch.eq(corpus_indices, torch.tensor([2, 0], device=device)))


# dev batch 2
encoder_inputs, decoder_inputs, corpus_indices = dev_batches[1]
assert torch.all(torch.eq(encoder_inputs['in'], torch.tensor([[9, 2, 7]], device=device)))
assert torch.all(torch.eq(encoder_inputs['sorted_lengths'], torch.tensor([3], device=device)))
assert torch.all(torch.eq(encoder_inputs['idxs_in_sorted'], torch.tensor([0], device=device)))

assert torch.all(torch.eq(decoder_inputs['mask'], torch.tensor([[[False, False, False]]], device=device)))
assert decoder_inputs['max_src_len'] == 3

assert torch.all(torch.eq(corpus_indices, torch.tensor([1], device=device)))









# print('\n\n\n\n\n')
# print("##################### dev_batches:")
# for i, dev_batch in enumerate(dev_batches):
#     print(f"dev batch {i+1}:")
#     encoder_inputs, decoder_inputs, corpus_indices = dev_batch
#     print("encoder_inputs:")
#     print(f"in: {encoder_inputs['in']}")
#     print(f"sorted_lengths: {encoder_inputs['sorted_lengths']}")
#     print(f"idxs_in_sorted: {encoder_inputs['idxs_in_sorted']}")
#     print('\n\n')
#     print("decoder_inputs:")
#     print(f"mask: {decoder_inputs['mask']}")
#     print(f"max_src_len: {decoder_inputs['max_src_len']}")
#     print('\n\n')
#     print(f"corpus_indices: {corpus_indices}")
#     print('\n\n\n\n')








In [None]:
# ensure attention mechanism produces correct result, everything is of correct shape, and initial loss is reasonable.


In [None]:


# step 8: train on the toy batches, scoring dev predictions against the
# "train.en" references.
reduction = 'sum' # easier to observe loss decrease each epoch
dev_references = ref_corpuses["train.en"]
model = train(
    hyperparams,
    train_batches,
    dev_batches,
    dev_references,
    idx_to_trg_word,
    checkpoint_path,
    save=True,
    reduction=reduction,
)


In [None]:
import torch

# Scratch 3x3 tensor holding 1..9, used by the next cells to experiment with
# argmax. It is placed on the GPU when one is available and otherwise on the
# CPU, so the cell no longer crashes on a CPU-only runtime.
scratch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
a = torch.arange(1, 10).to(scratch_device).view(3, 3)
a[2, 0] = 20  # move the last row's maximum from its final column to column 0
a

In [None]:
# number of dimensions of the scratch tensor `a`
a.dim()

In [None]:
# index of the maximum along dim 1 (one index per row of the 2-D `a`);
# keepdim=True keeps the reduced dimension in the result
torch.argmax(a, 1, keepdim=True)

In [None]:

import sacrebleu

# Corpus-level BLEU with two reference streams: each inner list of `refs`
# holds one reference per hypothesis, in the same order as `hypotheses`.
refs = [['The dog bit the man.', 'It was not unexpected.', 'The man bit him first.'],
        ['The dog had bit the man.', 'No one was surprised.', 'The man had bitten the dog.']]
# named `hypotheses` rather than `sys` so the stdlib module name is not shadowed
hypotheses = ['The dog bit the man.', "It wasn't surprising.", 'The man had just bitten him.']
bleu = sacrebleu.corpus_bleu(hypotheses, refs)
print(bleu.score)

In [None]:
# Same hypotheses scored against a single reference stream, for comparison
# with the two-stream score from the previous cell (which imports sacrebleu).
refs = [['The dog had bit the man.', 'No one was surprised.', 'The man had bitten the dog.']]
# named `hypotheses` rather than `sys` so the stdlib module name is not shadowed
hypotheses = ['The dog bit the man.', "It wasn't surprising.", 'The man had just bitten him.']
bleu = sacrebleu.corpus_bleu(hypotheses, refs)
print(bleu.score)