## All the imports we need for translations

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, T5ForConditionalGeneration, T5Tokenizer

## The first 6 paragraphs from Alice in Wonderland

The source is Project Gutenberg: https://www.gutenberg.org/cache/epub/11/pg11.txt

In [2]:
# Source passages to translate, one list element per paragraph.
# The line breaks inside each string are kept exactly as written here, and every
# element except the last ends with a trailing newline; the strings are passed
# to the models unchanged.
englishtexts = ['''Alice was beginning to get very tired of sitting by her sister on the
bank, and of having nothing to do: once or twice she had peeped into
the book her sister was reading, but it had no pictures or
conversations in it, “and what is the use of a book,” thought Alice
“without pictures or conversations?”
''','''So she was considering in her own mind (as well as she could, for the
hot day made her feel very sleepy and stupid), whether the pleasure of
making a daisy-chain would be worth the trouble of getting up and
picking the daisies, when suddenly a White Rabbit with pink eyes ran
close by her.
''','''There was nothing so _very_ remarkable in that; nor did Alice think it
so _very_ much out of the way to hear the Rabbit say to itself, “Oh
dear! Oh dear! I shall be late!” (when she thought it over afterwards,
it occurred to her that she ought to have wondered at this, but at the
time it all seemed quite natural); but when the Rabbit actually _took a
watch out of its waistcoat-pocket_, and looked at it, and then hurried
on, Alice started to her feet, for it flashed across her mind that she
had never before seen a rabbit with either a waistcoat-pocket, or a
watch to take out of it, and burning with curiosity, she ran across the
field after it, and fortunately was just in time to see it pop down a
large rabbit-hole under the hedge.
''','''In another moment down went Alice after it, never once considering how
in the world she was to get out again.
''','''The rabbit-hole went straight on like a tunnel for some way, and then
dipped suddenly down, so suddenly that Alice had not a moment to think
about stopping herself before she found herself falling down a very
deep well.
''','''Either the well was very deep, or she fell very slowly, for she had
plenty of time as she went down to look about her and to wonder what
was going to happen next. First, she tried to look down and make out
what she was coming to, but it was too dark to see anything; then she
looked at the sides of the well, and noticed that they were filled with
cupboards and book-shelves; here and there she saw maps and pictures
hung upon pegs. She took down a jar from one of the shelves as she
passed; it was labelled “ORANGE MARMALADE”, but to her great
disappointment it was empty: she did not like to drop the jar for fear
of killing somebody underneath, so managed to put it into one of the
cupboards as she fell past it.''']

## Pick the device (CUDA if available, otherwise CPU) and display it

In [3]:
# Use the first CUDA GPU when PyTorch reports one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
print(device)

cuda:0


## Load models for translation

In total the models take between 6 and 8 GB of memory. They should fit on a GPU with 8 GB of RAM, and they also run fine on the CPU.

In [4]:
# VietAI checkpoint: load its tokenizer, then the seq2seq model, and place the
# model on the selected device.
model_name = "VietAI/envit5-translation"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to(device)

In [5]:
# NlpHUST small T5 checkpoint (English -> Vietnamese): tokenizer first, then the
# model, which is moved onto the selected device.
model_name2 = "NlpHUST/t5-en-vi-small"
tokenizer2 = T5Tokenizer.from_pretrained(model_name2)
model2 = T5ForConditionalGeneration.from_pretrained(model_name2)
model2 = model2.to(device)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [6]:
# NlpHUST small T5 checkpoint (Vietnamese -> English): tokenizer first, then the
# model, which is moved onto the selected device.
model_name3 = "NlpHUST/t5-vi-en-small"
tokenizer3 = T5Tokenizer.from_pretrained(model_name3)
model3 = T5ForConditionalGeneration.from_pretrained(model_name3)
model3 = model3.to(device)

In [7]:
# VinAI English -> Vietnamese checkpoint. The tokenizer is created with the
# English source-language code.
model_name4 = "vinai/vinai-translate-en2vi-v2"
model4 = AutoModelForSeq2SeqLM.from_pretrained(model_name4)
model4 = model4.to(device)
tokenizer4 = AutoTokenizer.from_pretrained(model_name4, src_lang="en_XX")

In [8]:
# VinAI Vietnamese -> English checkpoint. The tokenizer is created with the
# Vietnamese source-language code.
model_name5 = "vinai/vinai-translate-vi2en-v2"
model5 = AutoModelForSeq2SeqLM.from_pretrained(model_name5)
model5 = model5.to(device)
tokenizer5 = AutoTokenizer.from_pretrained(model_name5, src_lang="vi_VN")

## Dictionary below will hold the translations for each of the models

The code below translates every English paragraph into Vietnamese with each English-to-Vietnamese model. The translations are stored in a dictionary keyed by model name.

In [9]:
# Vietnamese translations, keyed by the name of the model that produced them.
translated_outputs = dict()

In [10]:
# English -> Vietnamese with the first model. Each paragraph is sent with an
# "en: " tag in front, and a leading "vi: " tag is removed from the decoded
# result before it is stored.
print(model_name)
vi_tag = 'vi: '
translated_outputs[model_name] = []
for englishtext in englishtexts:
    encoded = tokenizer([f"en: {englishtext}"], return_tensors="pt", padding=True)
    generated = model.generate(encoded.input_ids.to(device), max_length=512)
    decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)[0]
    # Strip only a leading tag. Indexing the result of split() raised IndexError
    # when the tag was absent and silently dropped everything after a second
    # occurrence of the tag inside the translation.
    if decoded.startswith(vi_tag):
        decoded = decoded[len(vi_tag):]
    translated_outputs[model_name].append(decoded)

VietAI/envit5-translation


In [11]:
# English -> Vietnamese with the second model, using beam search.
print(model_name2)
translated_outputs[model_name2] = []
# Switching to evaluation mode does not depend on the paragraph, so do it once
# instead of on every loop iteration.
model2.eval()
for englishtext in englishtexts:
    source_ids = tokenizer2.encode(englishtext, return_tensors="pt").to(device)
    generated_ids = model2.generate(
        source_ids,
        max_length=512,
        num_beams=5,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=False,
    )
    # generate() returns one sequence per input; a single paragraph was sent.
    vi_text = tokenizer2.decode(generated_ids[0], skip_special_tokens=True)
    translated_outputs[model_name2].append(vi_text)

NlpHUST/t5-en-vi-small


In [12]:
# English -> Vietnamese with the VinAI model. Decoding is started from the
# "vi_VN" language code looked up in the tokenizer.
print(model_name4)
vinai_vi_outputs = []
translated_outputs[model_name4] = vinai_vi_outputs
vi_start_id = tokenizer4.lang_code_to_id["vi_VN"]
for englishtext in englishtexts:
    encoded = tokenizer4([englishtext], padding=True, return_tensors="pt")
    encoded = encoded.to(device)
    generated = model4.generate(
        **encoded,
        decoder_start_token_id=vi_start_id,
        num_return_sequences=1,
        num_beams=5,
        early_stopping=True,
    )
    decoded = tokenizer4.batch_decode(generated, skip_special_tokens=True)
    # Exactly one sequence is requested per paragraph, so keep the first entry.
    vinai_vi_outputs.append(decoded[0])

vinai/vinai-translate-en2vi-v2


## Take the Vietnamese translations and have each of the models translate them back into English

In [13]:
# Back-translations into English: outer key is the Vietnamese -> English model,
# inner key is the model that produced the Vietnamese text.
vietnamese_translations = dict()

In [14]:
# Vietnamese -> English with the small T5 model: every Vietnamese translation
# produced earlier (grouped by the model that produced it) is translated back.
print(model_name3)
vietnamese_translations[model_name3] = {}
# Evaluation mode is independent of the input, so set it once rather than
# inside the innermost loop.
model3.eval()
for key in translated_outputs:
    vietnamese_translations[model_name3][key] = []
    for input_text in translated_outputs[key]:
        source_ids = tokenizer3.encode(input_text, return_tensors="pt").to(device)
        # NOTE(review): max_length is 256 here while the English -> Vietnamese
        # cells use 512; confirm the longer paragraphs are not being cut short.
        generated_ids = model3.generate(
            source_ids,
            max_length=256,
            num_beams=5,
            repetition_penalty=2.5,
            length_penalty=1.0,
            early_stopping=False,
        )
        en_text = tokenizer3.decode(generated_ids[0], skip_special_tokens=True)
        vietnamese_translations[model_name3][key].append(en_text)


NlpHUST/t5-vi-en-small


In [15]:
# Vietnamese -> English with the first model. Each Vietnamese text is sent with
# a "vi: " tag in front, and a leading "en: " tag is removed from the decoded
# result before it is stored.
print(model_name)
en_tag = 'en: '
vietnamese_translations[model_name] = {}
for key in translated_outputs:
    vietnamese_translations[model_name][key] = []
    for input_text in translated_outputs[key]:
        encoded = tokenizer([f'vi: {input_text}'], return_tensors="pt", padding=True)
        generated = model.generate(encoded.input_ids.to(device), max_length=512)
        decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)[0]
        # Strip only a leading tag. Indexing the result of split() raised
        # IndexError when the tag was absent and silently dropped everything
        # after a second occurrence of the tag inside the translation.
        if decoded.startswith(en_tag):
            decoded = decoded[len(en_tag):]
        vietnamese_translations[model_name][key].append(decoded)

VietAI/envit5-translation


In [16]:
# Vietnamese -> English with the VinAI model. Decoding is started from the
# "en_XX" language code looked up in the tokenizer.
print(model_name5)
vinai_back = {}
vietnamese_translations[model_name5] = vinai_back
for key in translated_outputs:
    vinai_back[key] = []
    for input_text in translated_outputs[key]:
        encoded = tokenizer5([input_text], padding=True, return_tensors="pt")
        encoded = encoded.to(device)
        generated = model5.generate(
            **encoded,
            decoder_start_token_id=tokenizer5.lang_code_to_id["en_XX"],
            num_return_sequences=1,
            num_beams=5,
            early_stopping=False,
        )
        decoded = tokenizer5.batch_decode(generated, skip_special_tokens=True)
        # Exactly one sequence is requested per text, so keep the first entry.
        vinai_back[key].append(decoded[0])
    

vinai/vinai-translate-vi2en-v2


## Dump all the results to JSON in the data directory so the data can be reused later

In [17]:
import os
from json import dump, load

# Everything produced so far, collected in one JSON-serialisable structure:
#   "original"   - the English source paragraphs
#   "vietnamese" - Vietnamese translations keyed by English -> Vietnamese model
#   "english"    - back-translations keyed by Vietnamese -> English model, then
#                  by the model that produced the Vietnamese text
#   "mappings"   - for each Vietnamese -> English model, the English ->
#                  Vietnamese model it is paired with
data = {
    "original": englishtexts,
    "vietnamese": translated_outputs,
    "english": vietnamese_translations,
    "mappings": {
        model_name: model_name,
        model_name3: model_name2,
        model_name5: model_name4,
    },
}
data_path = "data/translations.json"
# Create the output directory if needed; opening the file failed with
# FileNotFoundError on a fresh checkout where "data/" did not exist yet.
os.makedirs(os.path.dirname(data_path), exist_ok=True)
# State the encoding explicitly instead of relying on the platform default.
with open(data_path, "w", encoding="utf-8") as outfile:
    dump(data, outfile, indent=4)

## Create an html document with a table of all the translations for analysis

In [18]:
from html import escape
from string import Template

# Page skeleton. The metadata container is <head> (the original used <header>,
# which is a body element), and the charset is declared because the page is
# saved as UTF-8 and contains Vietnamese text.
html_template = Template('<html><head><meta charset="utf-8"><title>$title</title></head><body>$body</body></html>')
# Header cells are wrapped in a row. The first column holds the key of
# data['english'], i.e. the Vietnamese -> English model, so it is labelled
# accordingly (it was previously labelled "en2vi model").
table_template = Template("<table><tr><th>vi2en model</th><th>original</th><th>vi translation</th><th>$model1</th><th>$model2</th><th>$model3</th></tr>$rows</table>")
# Names of the models that produced the Vietnamese text; they head the three
# back-translation columns.
vien_models = list(data['vietnamese'].keys())
# The $en2vi placeholder name is kept unchanged for compatibility; it receives
# the Vietnamese -> English model name.
row_template = Template("<tr><td><b>$en2vi</b></td><td>$original</td><td>$translation</td><td>$v1</td><td>$v2</td><td>$v3</td></tr>")

row_parts = []
for key in data['english']:
    # Vietnamese text shown in the row comes from the model paired with `key`.
    paired_model = data["mappings"][key]
    for ndx, original in enumerate(data["original"]):
        back_translations = [data["english"][key][name][ndx] for name in vien_models[:3]]
        # Every value is escaped so that characters such as <, > and & in the
        # source or in model output cannot break the markup.
        row_parts.append(row_template.substitute(
            en2vi=escape(key),
            original=escape(original),
            translation=escape(data["vietnamese"][paired_model][ndx]),
            v1=escape(back_translations[0]),
            v2=escape(back_translations[1]),
            v3=escape(back_translations[2]),
        ))
# Join once instead of growing a string inside the loop.
rows = ''.join(row_parts)
body = table_template.substitute(
    model1=escape(vien_models[0]),
    model2=escape(vien_models[1]),
    model3=escape(vien_models[2]),
    rows=rows,
)
html = html_template.substitute(title="Evaluate", body=body)

## Write the HTML string to a file encoded as UTF-8 (the encoding must be given explicitly so the Vietnamese characters are stored correctly)

In [19]:
import codecs  # no longer used in this cell; kept in case later cells rely on it
import os

data_path = "data/index.html"
# Create the output directory if needed so the write does not fail with
# FileNotFoundError when "data/" is missing.
os.makedirs(os.path.dirname(data_path), exist_ok=True)
# The built-in open() takes the encoding directly, replacing the legacy
# codecs.open(). newline="" writes line endings exactly as they appear in the
# string, matching what codecs.open() did.
with open(data_path, "w", encoding="utf-8", newline="") as outfile:
    outfile.write(html)