<a href="https://colab.research.google.com/github/markaaronslater/NMT/blob/master/unit_testing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# environment for running unit tests, observing model outputs, etc.

In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported automatically before each cell runs (edits to project source
# files take effect without restarting the kernel).
%load_ext autoreload
%autoreload 2

In [None]:
# Mount Google Drive at /content/gdrive; every path used later in this
# notebook lives underneath that mount point.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
!pip install subword-nmt # for segmenting words into subwords
!pip install stanza # for tokenizing corpus and tagging with morphological data
!pip install sacremoses # for detokenizing model predictions
!pip install sacrebleu # for evaluation
!pip install pytest

In [None]:
# Report the GPU (if any) attached to this runtime.
!nvidia-smi

In [None]:
# Project root. Recommended layout: the cloned NMT folder sits directly inside
# the Google Drive folder 'My Drive'.
path = '/content/gdrive/My Drive/NMT/'

# Corpus directory; swap in the commented line to use the full IWSLT16 data.
corpus_path = f'{path}corpuses/toy_corpuses/'
#corpus_path = f'{path}corpuses/iwslt16_en_de/'

# Remaining project sub-directories, all directly under the root.
config_path, data_path, checkpoint_path = (
    f'{path}{subdir}/' for subdir in ('configs', 'data', 'checkpoints')
)

# Name of model tensor batches, hyperparameters, etc., saved as pickle file
# inside data_path.
model_name = 'my_model'

In [None]:
# %cd /content/gdrive/My Drive/

In [None]:
# from NMT.src.preprocessing.apply_stanza_processors import apply_stanza_processors, retrieve_stanza_outputs
# from NMT.src.preprocessing.corpus_utils import read_corpuses, print_corpuses, print_processed_corpuses
# from NMT.src.preprocessing.truecase import truecase_corpuses
# from NMT.src.import_configs import import_configs
# from NMT.src.preprocessing.preprocess import construct_model_data, retrieve_model_data
# from NMT.src.train import train


In [None]:
# Change into the repository root so that the `src.` package imports below and
# the relative `unittests/...` paths used by the pytest cells resolve.
# NOTE(review): later cells instead `%cd` to 'My Drive/' and import the same
# modules as `NMT.src....`; mixing both styles in one kernel session presumably
# loads each module twice under different names -- confirm which is intended.
%cd /content/gdrive/My Drive/NMT

from src.preprocessing.apply_stanza_processors import apply_stanza_processors, retrieve_stanza_outputs
from src.preprocessing.corpus_utils import read_corpuses, print_corpuses, print_processed_corpuses
from src.preprocessing.truecase import truecase_corpuses
from src.import_configs import import_configs
from src.preprocessing.preprocess import construct_model_data, retrieve_model_data
from src.train import train

In [None]:
# Run every test in unittests/test_batches.py (path is relative to the
# current working directory, i.e. the repository root).
!python -m pytest unittests/test_batches.py

In [None]:
# Run only test_default_word_model; -s shows the test's own stdout instead of
# capturing it, -v prints one line per test.
!python -m pytest -s -v unittests/test_model.py::test_default_word_model

In [None]:
# Run only test_default_subword_model; -s shows the test's own stdout instead
# of capturing it, -v prints one line per test.
!python -m pytest -s -v unittests/test_model.py::test_default_subword_model

In [None]:
# CPU variant of the word-model test.
# Before running this cell, ensure the runtime is using the CPU
# (Runtime -> Change runtime type -> Hardware accelerator = None).
!python -m pytest -s -v unittests/test_model.py::test_default_word_model_cpu

In [None]:

# Read the raw (empty-prefix) corpus files, restricted via `_start`/`num`,
# and print them for inspection.
corpuses = read_corpuses(
    "train.de", "train.en", "dev.de", "dev.en", "test.de",
    path=corpus_path,
    prefix='',
    _start=1,
    num=5,
)
print_corpuses(corpuses, num=5)


In [None]:
# step 1
# Run the stanza processors over each corpus file, then load their stored
# outputs back and display them.
apply_stanza_processors(
    "train.de", "train.en", "dev.de", "dev.en", "test.de",
    path=corpus_path,
    _start=1,
    num=10,
)
corpuses = retrieve_stanza_outputs(
    "train.de", "train.en", "dev.de", "dev.en", "test.de",
    path=corpus_path,
)

print_processed_corpuses(corpuses)

In [None]:
# step 2
# Truecase every corpus, then read back the files carrying the 'word_' prefix
# (presumably the truecased outputs -- confirm against truecase_corpuses).
truecase_corpuses(
    "train.de", "train.en", "dev.de", "dev.en", "test.de",
    path=corpus_path,
)
corpuses = read_corpuses(
    "train.de", "train.en", "dev.de", "dev.en", "test.de",
    path=corpus_path,
    prefix='word_',
)
print_corpuses(corpuses)



In [None]:
# step 3
# Load the hyperparameter settings found under config_path and list them,
# one "name: value" pair per line.
hyperparams = import_configs(config_path=config_path)
for hp, setting in hyperparams.items():
    print(f"{hp}: {setting}")

In [None]:
# step 4
subword_corpus_path = '/content/gdrive/My\ Drive/NMT/corpuses/iwslt16_en_de/'
num_merge_ops = 1000 # for unit testing, overwrite to smaller values
vocab_threshold = 2
!bash ./NMT/src/preprocessing/subword_joint.sh 1000 2 '/content/gdrive/My Drive/NMT/corpuses/iwslt16_en_de/'
#!bash ./NMT/src/preprocessing/subword_joint.sh $num_merge_ops $vocab_threshold $subword_corpus_path

In [None]:
# corpuses, ref_corpuses = read_tokenized_corpuses("train.de", "train.en", "dev.de", "dev.en", "test.de", path='/content/gdrive/My Drive/NMT/corpuses/iwslt16_en_de/', prefix='word_')
# print_corpuses(corpuses)
# print_corpuses(ref_corpuses)



In [None]:
# toy corpuses
# Full preprocessing pass (read -> stanza -> truecase) over the toy training
# pair, printing the corpuses after each stage.
corpus_path = f'{path}corpuses/toy_corpuses/'

# raw files
corpuses = read_corpuses(
    "train.de", "train.en",
    path=corpus_path,
    prefix='',
)
print_corpuses(corpuses)

# stanza processing
apply_stanza_processors("train.de", "train.en", path=corpus_path)
corpuses = retrieve_stanza_outputs("train.de", "train.en", path=corpus_path)

print_processed_corpuses(corpuses)

# truecasing, then read back the 'word_'-prefixed files
truecase_corpuses("train.de", "train.en", path=corpus_path)
corpuses = read_corpuses(
    "train.de", "train.en",
    path=corpus_path,
    prefix='word_',
)
print_corpuses(corpuses)


In [None]:
# ensure batches of tensors constructed correctly.


%load_ext autoreload
%autoreload 2

# recommended: place cloned NMT folder in Google drive folder 'My Drive':
path = '/content/gdrive/My Drive/NMT/'
#corpus_path = path + 'corpuses/iwslt16_en_de/'
corpus_path = path + 'corpuses/toy_corpuses/'

config_path = path + 'configs/'
data_path = path + 'data/'
checkpoint_path = path + 'checkpoints/'

model_name = 'my_model' # name of model tensor batches, hyperparameters, etc., saved as pickle file inside data_path


%cd /content/gdrive/My Drive/

import torch

from NMT.src.import_configs import import_configs
from NMT.src.preprocessing.preprocess import construct_model_data, retrieve_model_data
from NMT.src.train import train
from NMT.src.preprocessing.corpus_utils import read_tokenized_corpuses

# step 5
hyperparams = import_configs(config_path=config_path)
hyperparams["vocab_type"] = "word"
hyperparams["trim_type"] = "top_k"
hyperparams["src_k"] = 10
hyperparams["trg_k"] = 10
hyperparams["train_bsz"] = 2
hyperparams["dev_bsz"] = 2
hyperparams["decode_slack"] = 30
hyperparams["early_stopping"] = False


vocabs, corpuses, ref_corpuses = construct_model_data("train.de", "train.en", hyperparams=hyperparams,
                     corpus_path=corpus_path, data_path=data_path, model_name=model_name, overfit=True
                    )





# step 6
model_data = retrieve_model_data(data_path=data_path, model_name=model_name)

train_batches = model_data["train_batches"]
dev_batches = model_data["dev_batches"]
test_batches = model_data["test_batches"]
idx_to_trg_word = model_data["idx_to_trg_word"]
ref_corpuses = model_data["ref_corpuses"]
hyperparams = model_data["hyperparams"]
device = hyperparams["device"]


print(f'src vocab:{vocabs["src_word_to_idx"]}')
print(f'trg vocab:{vocabs["trg_word_to_idx"]}')
src_word_to_idx = vocabs["src_word_to_idx"]
trg_word_to_idx = vocabs["trg_word_to_idx"]

# tokenized, truecased toy corpuses:
# train.de:
# [['das', 'ist', 'wahr', '.'], ['mache', 'ich', 'Ja'], ['heute', 'Abend', '!', '!', '!']]

# train.en:
# [['it', "'s", 'true'], ['do', 'I', '?', 'yes'], ['tonight']]}


def _to_idxs(sents, word_to_idx):
    """Map each token of each sentence in `sents` to its index in `word_to_idx`."""
    return [[word_to_idx[word] for word in sent] for sent in sents]


# Expected encoder inputs for the two train batches (source vocab).
train_encoder_inputs_in1 = _to_idxs(
    [['das', 'ist', 'wahr', '.'], ['mache', 'ich', 'Ja', '<pad>']], src_word_to_idx)
train_encoder_inputs_in2 = _to_idxs(
    [['heute', 'Abend', '!', '!', '!']], src_word_to_idx)

# Expected decoder inputs for the two train batches (target vocab).
train_decoder_inputs_in1 = _to_idxs(
    [['<sos>', 'do', 'I', '?', 'yes'], ['<sos>', 'it', "'s", 'true', '<pad>']], trg_word_to_idx)
train_decoder_inputs_in2 = _to_idxs(
    [['<sos>', 'tonight']], trg_word_to_idx)

# Expected decoder targets: flat token lists (target vocab).
train_decoder_targets1 = [
    trg_word_to_idx[word]
    for word in ['do', 'it', 'I', "'s", '?', 'true', 'yes', '<eos>', '<eos>']
]
train_decoder_targets2 = [
    trg_word_to_idx[word]
    for word in ['tonight', '<eos>']
]

# Expected encoder inputs for the two dev batches (source vocab).
dev_encoder_inputs_in1 = _to_idxs(
    [['heute', 'Abend', '!', '!', '!'], ['das', 'ist', 'wahr', '.', '<pad>']], src_word_to_idx)
dev_encoder_inputs_in2 = _to_idxs(
    [['mache', 'ich', 'Ja']], src_word_to_idx)


print('\n\n\n\n\n')


def _matches(actual, expected):
    """Return a truthy 0-dim tensor iff `actual` equals `expected` element-wise.

    `expected` is a (nested) Python list. It is always materialised on the
    notebook-level `device`, so every comparison runs on whatever device the
    batches were built for -- no check hard-codes 'cuda:0' any more.
    """
    return torch.all(torch.eq(actual, torch.tensor(expected, device=device)))


### train_batches:
# train batch 1
encoder_inputs, decoder_inputs, decoder_targets = train_batches[0]
assert encoder_inputs['in'].tolist() == train_encoder_inputs_in1
assert _matches(encoder_inputs['sorted_lengths'], [4, 3])
assert _matches(encoder_inputs['idxs_in_sorted'], [1, 0])

assert decoder_inputs['in'].tolist() == train_decoder_inputs_in1
assert _matches(decoder_inputs['lengths'], [5, 4])
assert _matches(decoder_inputs['mask'], [[[False, False, False,  True]], [[False, False, False, False]]])

assert decoder_targets.tolist() == train_decoder_targets1


# train batch 2
encoder_inputs, decoder_inputs, decoder_targets = train_batches[1]
assert encoder_inputs['in'].tolist() == train_encoder_inputs_in2
assert _matches(encoder_inputs['sorted_lengths'], [5])
assert _matches(encoder_inputs['idxs_in_sorted'], [0])

assert decoder_inputs['in'].tolist() == train_decoder_inputs_in2
assert _matches(decoder_inputs['lengths'], [2])
assert _matches(decoder_inputs['mask'], [[[False, False, False, False, False]]])

assert decoder_targets.tolist() == train_decoder_targets2


### dev_batches:
# dev batch 1
encoder_inputs, decoder_inputs, corpus_indices = dev_batches[0]
assert encoder_inputs['in'].tolist() == dev_encoder_inputs_in1
assert _matches(encoder_inputs['sorted_lengths'], [5, 4])
assert _matches(encoder_inputs['idxs_in_sorted'], [0, 1])

# Previously compared against a tensor pinned to 'cuda:0', which fails on a
# CPU runtime or any other device; now uses `device` like every other check.
assert _matches(decoder_inputs['mask'], [[[False, False, False, False, False]], [[False, False, False, False,  True]]])
assert decoder_inputs['max_src_len'] == 5

assert _matches(corpus_indices, [2, 0])


# dev batch 2
encoder_inputs, decoder_inputs, corpus_indices = dev_batches[1]
assert encoder_inputs['in'].tolist() == dev_encoder_inputs_in2
assert _matches(encoder_inputs['sorted_lengths'], [3])
assert _matches(encoder_inputs['idxs_in_sorted'], [0])

assert _matches(decoder_inputs['mask'], [[[False, False, False]]])
assert decoder_inputs['max_src_len'] == 3

assert _matches(corpus_indices, [1])






# step 8
# Train on the toy batches, using the "train.en" references as dev references.
# NOTE(review): another cell unpacks train's result as `model, loss` --
# confirm which return shape is current.
dev_references = ref_corpuses["train.en"]
print(dev_references)

# easier to observe loss decrease each epoch
reduction = 'sum'
model = train(
    hyperparams, train_batches, dev_batches, dev_references, idx_to_trg_word, checkpoint_path,
    save=True,
    reduction=reduction,
)









In [None]:
# TODO: ensure attention mechanism produces correct result, everything is of correct shape, and initial loss is reasonable.


In [None]:
# Scratch check: lists holding mixed element types compare element-wise with ==.
[1, 2, "hi"] == [1, 2, "hi"]



In [None]:
import torch

# Scratch 3x3 integer tensor holding 1..9, with the bottom-left entry bumped
# to 20. It is placed on the GPU when one is available and on the CPU
# otherwise, so the cell no longer crashes on a CPU-only runtime.
a = torch.arange(1, 10, device="cuda" if torch.cuda.is_available() else "cpu").view(3, 3)
a[2, 0] = 20
a.tolist()

In [None]:
# Number of dimensions of the scratch tensor.
a.ndim

In [None]:
# Index of the largest entry along dimension 1, keeping that dimension.
a.argmax(dim=1, keepdim=True)

In [None]:

import sacrebleu

# Corpus-level BLEU with two reference lists; each list has one sentence per
# system output, in the same order.
refs = [
    ['The dog bit the man.', 'It was not unexpected.', 'The man bit him first.'],
    ['The dog had bit the man.', 'No one was surprised.', 'The man had bitten the dog.'],
]
sys = [
    'The dog bit the man.',
    "It wasn't surprising.",
    'The man had just bitten him.',
]
bleu = sacrebleu.corpus_bleu(sys, refs)
print(bleu.score)

In [None]:
# Same system outputs scored against a single reference list.
refs = [
    ['The dog had bit the man.', 'No one was surprised.', 'The man had bitten the dog.'],
]
sys = [
    'The dog bit the man.',
    "It wasn't surprising.",
    'The man had just bitten him.',
]
bleu = sacrebleu.corpus_bleu(sys, refs)
print(bleu.score)

In [None]:
# 1x1 int64 tensor filled with the value 5.
b = torch.full(size=(1, 1), fill_value=5, dtype=torch.int64)
b

In [None]:
# Shape of the scratch tensor, as a torch.Size.
b.shape

In [None]:
# overfit to first 10 sentences of training set

from NMT.src.preprocessing.apply_stanza_processors import apply_stanza_processors, retrieve_stanza_outputs
from NMT.src.preprocessing.corpus_utils import read_corpuses, print_corpuses, print_processed_corpuses
from NMT.src.preprocessing.truecase import truecase_corpuses



# Preprocess the IWSLT16 training pair, limited via `num=10`
# (read -> stanza -> truecase), printing the corpuses after each stage.
corpus_path = f'{path}corpuses/iwslt16_en_de/'

# raw files
corpuses = read_corpuses(
    "train.de", "train.en",
    path=corpus_path,
    prefix='',
    num=10,
)
print_corpuses(corpuses)

# stanza processing
apply_stanza_processors(
    "train.de", "train.en",
    path=corpus_path,
    num=10,
)
corpuses = retrieve_stanza_outputs("train.de", "train.en", path=corpus_path)
print_processed_corpuses(corpuses)

# truecasing, then read back the 'word_'-prefixed files
truecase_corpuses("train.de", "train.en", path=corpus_path)
corpuses = read_corpuses(
    "train.de", "train.en",
    path=corpus_path,
    prefix='word_',
)
print_corpuses(corpuses)


In [None]:
# overfit to first 10 sentences of training set

%load_ext autoreload
%autoreload 2

# recommended: place cloned NMT folder in Google drive folder 'My Drive':
path = '/content/gdrive/My Drive/NMT/'
corpus_path = path + 'corpuses/iwslt16_en_de/'
#corpus_path = path + 'corpuses/toy_corpuses/'

config_path = path + 'configs/'
data_path = path + 'data/'
checkpoint_path = path + 'checkpoints/'

model_name = 'my_model' # name of model tensor batches, hyperparameters, etc., saved as pickle file inside data_path


%cd /content/gdrive/My Drive/

import torch

from NMT.src.import_configs import import_configs
from NMT.src.preprocessing.preprocess import construct_model_data, retrieve_model_data
from NMT.src.train import train
from NMT.src.preprocessing.corpus_utils import read_tokenized_corpuses

# step 5
hyperparams = import_configs(config_path=config_path)
hyperparams["vocab_type"] = "word"
hyperparams["trim_type"] = "top_k"
hyperparams["src_k"] = 200
hyperparams["trg_k"] = 200
hyperparams["train_bsz"] = 10
hyperparams["dev_bsz"] = 10
hyperparams["decode_slack"] = 30
hyperparams["early_stopping"] = False
hyperparams["total_epochs"] = 50
hyperparams["enc_hidden_size"] = 1000
hyperparams["dec_hidden_size"] = 1000



vocabs, corpuses, ref_corpuses = construct_model_data("train.de", "train.en", hyperparams=hyperparams,
                     corpus_path=corpus_path, data_path=data_path, model_name=model_name, overfit=True
                    )


model_data = retrieve_model_data(data_path=data_path, model_name=model_name)

train_batches = model_data["train_batches"]
dev_batches = model_data["dev_batches"]
idx_to_trg_word = model_data["idx_to_trg_word"]
ref_corpuses = model_data["ref_corpuses"]
hyperparams = model_data["hyperparams"]


print(f'src vocab:{vocabs["src_word_to_idx"]}')
print(f'trg vocab:{vocabs["trg_word_to_idx"]}')




dev_references = ref_corpuses["train.en"]
print(dev_references)
#reduction = 'sum' # easier to observe loss decrease each epoch
# may actually impair convergence if takes too big a step...
model, loss = train(hyperparams, train_batches, dev_batches, dev_references, idx_to_trg_word, checkpoint_path, save=False)





In [None]:
# predict train set

from NMT.src.predict import predict

# Switch the decoder's inference algorithm: beam search instead of greedy search.
model.decoder.set_inference_alg("beam_search")

# change to test_batches
bleu, preds_time, post_time = predict(
    model,
    dev_batches,
    dev_references,
    idx_to_trg_word,
    checkpoint_path,
)
print(round(bleu, 2))

In [None]:
# IPython introspection: show the documentation of the builtin `round`.
# NOTE(review): leftover exploratory lookup; not needed to run the notebook.
round?

In [None]:
# entry j is True if seq j finished being translated this timestep.
just_finished = torch.tensor([False, True, False, True])

# obtain indices to extract the sequences that just finished:
# first as a tuple of index tensors, then as a single 2-D index tensor.
print(just_finished.nonzero(as_tuple=True))
print(just_finished.nonzero(as_tuple=False))

# effect of squeezing dim 0 vs. dim 1 of the 2-D index tensor.
print(just_finished.nonzero().squeeze(0))
print(just_finished.nonzero().squeeze(1))


In [None]:
# One-element, 1-D tensor.
x = torch.as_tensor([2])
x

In [None]:
# Drop all size-1 dimensions of x.
torch.squeeze(x)

In [None]:
# Zero-dimensional (scalar) tensor.
y = torch.as_tensor(3)
y

In [None]:
################ now, overfit using subword vocab

# overfit to first 10 sentences of training set

%load_ext autoreload
%autoreload 2

# recommended: place cloned NMT folder in Google drive folder 'My Drive':
path = '/content/gdrive/My Drive/NMT/'
corpus_path = path + 'corpuses/iwslt16_en_de/'
config_path = path + 'configs/'
data_path = path + 'data/'
checkpoint_path = path + 'checkpoints/'

model_name = 'my_model' # name of model tensor batches, hyperparameters, etc., saved as pickle file inside data_path


%cd /content/gdrive/My Drive/

import torch

from NMT.src.import_configs import import_configs
from NMT.src.preprocessing.preprocess import construct_model_data, retrieve_model_data
from NMT.src.train import train
from NMT.src.preprocessing.corpus_utils import read_tokenized_corpuses

# step 3
hyperparams = import_configs(config_path=config_path)
hyperparams["num_merge_ops"] = 300
hyperparams["vocab_threshold"] = 0
hyperparams["train_bsz"] = 10
hyperparams["dev_bsz"] = 10
hyperparams["decode_slack"] = 30
hyperparams["early_stopping"] = False
hyperparams["total_epochs"] = 50
hyperparams["enc_hidden_size"] = 1000
hyperparams["dec_hidden_size"] = 1000

num_merge_ops = hyperparams["num_merge_ops"]
vocab_threshold = hyperparams["vocab_threshold"]
# print("start")
# !echo $num_merge_ops
# !echo $vocab_threshold
# !echo "$corpus_path"
# print("finish")

# step 4
!bash ./NMT/src/preprocessing/subword_joint.sh $num_merge_ops $vocab_threshold "$corpus_path"


# step 5
vocabs, corpuses, ref_corpuses = construct_model_data("train.de", "train.en", hyperparams=hyperparams,
                     corpus_path=corpus_path, data_path=data_path, model_name=model_name, overfit=True
                    )


model_data = retrieve_model_data(data_path=data_path, model_name=model_name)

train_batches = model_data["train_batches"]
dev_batches = model_data["dev_batches"]
idx_to_trg_word = model_data["idx_to_trg_word"]
ref_corpuses = model_data["ref_corpuses"]
hyperparams = model_data["hyperparams"]


print(f'src vocab:{vocabs["src_word_to_idx"]}')
print(f'trg vocab:{vocabs["trg_word_to_idx"]}')


dev_references = ref_corpuses["train.en"]
print(dev_references)
model = train(hyperparams, train_batches, dev_batches, dev_references, idx_to_trg_word, checkpoint_path, save=False)

In [None]:
hyperparams = import_configs(config_path=config_path)
# overwrite default settings for hyperparams such that conforms to test conditions
hyperparams.update({
    "vocab_type": "word",
    "trim_type": "top_k",
    # set large enough such that no <unk> tokens (or else will not achieve BLEU of 100)
    "src_k": 200,
    "trg_k": 200,
    "train_bsz": 3,
    "dev_bsz": 3,
    # set large enough such that can finish predicting each of the 10 target
    # sentences (or else will not achieve BLEU of 100)
    "decode_slack": 30,
    # let the loss go down to zero.
    "early_stopping": False,
    "total_epochs": 50,
    # ensure model is of sufficient capacity
    "enc_hidden_size": 1000,
    "dec_hidden_size": 1000,
    # ensure regularization turned off
    "enc_dropout": 0,
    "dec_dropout": 0,
    "L2_reg": 0,
})