<a href="https://colab.research.google.com/github/markaaronslater/NMT/blob/master/playground.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Mount Google Drive at /content/gdrive; the project paths used in the cells
# below all live under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
!pip install subword-nmt # for segmenting words into subwords
!pip install stanza # for tokenizing corpus and tagging with morphological data
!pip install sacremoses # for detokenizing model predictions
!pip install sacrebleu # for evaluation

In [None]:
# Make sure a GPU is attached to this runtime
# (Runtime -> Change runtime type -> Hardware accelerator = GPU).
# The translation playground cell below requests device 'cuda:0'.
!nvidia-smi

In [None]:
# Switch the working directory to the project root; the `src.` imports in the
# next cell are resolved relative to it.
%cd /content/gdrive/My Drive/NMT

In [6]:
from src.model_utils import load_pretrained
from src.translate import translate
from src.predict import predict
from src.evaluate import evaluate
from src.preprocessing.corpus_utils import read_corpus, get_references

import stanza
from subword_nmt.apply_bpe import BPE

In [8]:
# Project root. Recommended setup: place the cloned NMT folder directly inside
# the 'My Drive' folder of your Google Drive account.
path = '/content/gdrive/My Drive/NMT/'
# Name of the pre-trained model to load (sub-directory of 'checkpoints/').
model_name = 'relu2/'
checkpoint_path = f'{path}checkpoints/{model_name}'
corpus_path = f'{path}data/iwslt/en-de/subword_segmented/'

# Restore the pre-trained translator together with its vocabulary mappings.
translator, model_data = load_pretrained(checkpoint_path=checkpoint_path)
src_word_to_idx, idx_to_trg_word = (
    model_data[key] for key in ("src_word_to_idx", "idx_to_trg_word")
)


In [None]:
# Download the German Stanza models, then build a pipeline that runs the
# tokenizer, multi-word-token expander and POS tagger.
stanza.download(lang='de', processors='tokenize,mwt,pos')
stanza_de_processor = stanza.Pipeline(
    lang='de',
    processors='tokenize,mwt,pos',
    # per the Stanza docs, this disables automatic sentence splitting
    tokenize_no_ssplit=True,
    tokenize_batch_size=64,
    mwt_batch_size=200,
    pos_batch_size=10000,
)

In [9]:
# Build the subword segmenter from the learned BPE merge codes, restricted to
# the source vocabulary of the loaded model. BPE reads the codes file inside
# its constructor, so the handle can be closed as soon as the object exists;
# the encoding is given explicitly so the result does not depend on the
# platform default.
with open(corpus_path + 'bpe_codes', 'r', encoding='utf-8') as bpe_codes_file:
    bpe = BPE(bpe_codes_file, vocab=set(src_word_to_idx))

In [None]:
# a) translation playground:
# place any number of whatever German sentences you want as strings inside the following list.
# (named `src_sentences` rather than `input` so the builtin `input()` is not shadowed
# for the rest of the notebook)
src_sentences = ["Dies ist ein deutscher Beispielsatz. Wird es richtig übersetzt?",
                 "Wenn nicht, wird diese Demo nicht sehr beeindruckend sein ...",
                 "Ich empfehle, dass Sie zuerst einen englischen Satz erstellen und ihn dann mit Google Translate in Deutsch konvertieren."]

# English counterparts of the sentences above (determined via Google Translate);
# used as references by the optional BLEU cell below:
sample_targets = ["This is a sample German sentence. Will it be translated correctly?",
                  "If not, then this demo will not be very impressive...",
                  "I recommend that you first come up with an English sentence, and then use Google Translate to convert it to German."]

# Translate on the first GPU in batches of 8, then print one translation per line.
translations = translate(src_sentences, stanza_de_processor, translator, src_word_to_idx, idx_to_trg_word, bpe, device='cuda:0', bsz=8)
for translation in translations:
    print(translation)

In [None]:
# optional - if targets are available, evaluate via BLEU metric.
# `sample_targets` is wrapped in a list; presumably `evaluate` expects a list of
# reference sets (one entry per set of references) - confirm against src/evaluate.py.
print(evaluate(translations, [sample_targets]))

In [None]:
# b) replicate BLEU score on test set
### can observe predictions inside <checkpoint_path>/beam_preds.txt
test_path = path + 'data/iwslt/en-de/'
test_set = read_corpus('test.de', path=test_path)
test_references = get_references(path=test_path, dev=False)
# Stored under its own name so the playground's `translations` is not overwritten
# (the optional BLEU cell of part a) stays valid if it is re-run after this cell).
# NOTE(review): `checkpoint_path` is passed positionally here, while the playground
# call passes `device`/`bsz` by keyword - confirm it lands on the intended parameter
# of `translate`.
test_translations = translate(test_set, stanza_de_processor, translator, src_word_to_idx, idx_to_trg_word, bpe, checkpoint_path)
print(evaluate(test_translations, test_references))