In [1]:
import pickle
from pickle import dump

In [2]:
def load_doc(filename):
    file = open(filename, mode = 'rt', encoding = 'utf-8')
    text = file.read()
    file.close()
    return text

def to_sentences(doc):
    return doc.strip().split('\n')

def sentence_lengths(sentences):
    lengths = [len(s.split()) for s in sentences]
    return min(lengths), max(lengths)

In [3]:
import re
import string
import unicodedata
def clean_lines(lines):
    cleaned = list()
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    table = str.maketrans('', '', string.punctuation)
    for line in lines:
        line = unicodedata.normalize('NFD', line).encode('ascii', 'ignore')
        line = line.decode('UTF-8')
        line = line.split()
        line = [word.lower() for word in line]
        line = [word.translate(table) for word in line]
        line = [re_print.sub('', w) for w in line]
        line = [word for word in line if word.isalpha()]
        cleaned.append(' '.join(line))
    return cleaned

In [4]:
import os
filename = os.getenv('HOME') + '/Desktop/dj_study/transformers/content/fr-en/europarl-v7.fr-en.en'
doc = load_doc(filename)
sentences = to_sentences(doc)
minlen, maxlen = sentence_lengths(sentences)
print(f'English data: sentences={len(sentences)}, min = {minlen}, max = {maxlen}')
cleanf = clean_lines(sentences)

English data: sentences=2007723, min = 0, max = 668


In [5]:
eng_filename = os.getenv('HOME') + '/Desktop/dj_study/transformers/content/fr-en/English.pkl'
outfile = open(eng_filename, 'wb')
pickle.dump(cleanf, outfile)
outfile.close()
print(filename, ' saved')

/home/aiffel-dj46/Desktop/dj_study/transformers/content/fr-en/europarl-v7.fr-en.en  saved


In [6]:
filename = os.getenv('HOME') + '/Desktop/dj_study/transformers/content/fr-en/europarl-v7.fr-en.fr'
doc = load_doc(filename)
sentences = to_sentences(doc)
minlen, maxlen = sentence_lengths(sentences)
print(f'French data: sentences={len(sentences)}, min = {minlen}, max = {maxlen}')
cleanf = clean_lines(sentences)
fr_filename = os.getenv('HOME') + '/Desktop/dj_study/transformers/content/fr-en/French.pkl'
outfile = open(fr_filename, 'wb')
pickle.dump(cleanf, outfile)
outfile.close()
print(filename, ' saved')

French data: sentences=2007723, min = 0, max = 693
/home/aiffel-dj46/Desktop/dj_study/transformers/content/fr-en/europarl-v7.fr-en.fr  saved


In [9]:
from pickle import load
from pickle import dump
from collections import Counter

def load_clean_sentences(filename):
    return load(open(filename, 'rb'))

def save_clean_sentences(sentences, filename):
    dump(sentences, open(filename, 'wb'))
    print(f'Saved: {filename}')

def to_vocab(lines):
    vocab = Counter()
    for line in lines:
        tokens = line.split()
        vocab.update(tokens)
    return vocab

def trim_vocab(vocab, min_occurance):
    tokens = [k for k, c in vocab.items() if c >= min_occurance]
    return set(tokens)

def update_dataset(lines, vocab):
    new_lines = list()
    for line in lines:
        new_tokens = list()
        for token in line.split():
            if token in vocab:
                new_tokens.append(token)
                
            else:
                new_tokens.append('unk')
        new_line = ' '.join(new_tokens)
        new_lines.append(new_line)
    return new_lines

In [11]:
lines = load_clean_sentences(eng_filename)
vocab = to_vocab(lines)
print(f'Vocabulary: {len(vocab)}')
vocab = trim_vocab(vocab, 5)
print(f'new_vocabulary : {len(vocab)}')
lines = update_dataset(lines, vocab)
filename = os.getenv('HOME') + '/Desktop/dj_study/transformers/content/fr-en/english_vocab.pkl'
save_clean_sentences(lines, filename)
for i in range(20):
    print('line', i, ":", lines[i])

Vocabulary: 105357
new_vocabulary : 41746
Saved: /home/aiffel-dj46/Desktop/dj_study/transformers/content/fr-en/english_vocab.pkl
line 0 : resumption of the session
line 1 : i declare resumed the session of the european parliament adjourned on friday december and i would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period
line 2 : although as you will have seen the dreaded millennium bug failed to materialise still the people in a number of countries suffered a series of natural disasters that truly were dreadful
line 3 : you have requested a debate on this subject in the course of the next few days during this partsession
line 4 : in the meantime i should like to observe a minute s silence as a number of members have requested on behalf of all the victims concerned particularly those of the terrible storms in the various countries of the european union
line 5 : please rise then for this minute s silence
line 6 : the house rose and observe

In [13]:
lines = load_clean_sentences(fr_filename)
vocab = to_vocab(lines)
print(f'Vocabulary: {len(vocab)}')
vocab = trim_vocab(vocab, 5)
print(f'new_vocabulary : {len(vocab)}')
lines = update_dataset(lines, vocab)
filename = os.getenv('HOME') + '/Desktop/dj_study/transformers/content/fr-en/french_vocab.pkl'
save_clean_sentences(lines, filename)
for i in range(20):
    print('line', i, ":", lines[i])

Vocabulary: 141642
new_vocabulary : 58800
Saved: /home/aiffel-dj46/Desktop/dj_study/transformers/content/fr-en/french_vocab.pkl
line 0 : reprise de la session
line 1 : je declare reprise la session du parlement europeen qui avait ete interrompue le vendredi decembre dernier et je vous renouvelle tous mes vux en esperant que vous avez passe de bonnes vacances
line 2 : comme vous avez pu le constater le grand bogue de lan ne sest pas produit en revanche les citoyens dun certain nombre de nos pays ont ete victimes de catastrophes naturelles qui ont vraiment ete terribles
line 3 : vous avez souhaite un debat a ce sujet dans les prochains jours au cours de cette periode de session
line 4 : en attendant je souhaiterais comme un certain nombre de collegues me lont demande que nous observions une minute de silence pour toutes les victimes des tempetes notamment dans les differents pays de lunion europeenne qui ont ete touches
line 5 : je vous invite a vous lever pour cette minute de silence
li

In [14]:
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction

In [15]:
import os
import numpy as np
import trax

In [16]:
model = trax.models.Transformer(
    input_vocab_size = 33300,
    d_model = 512, d_ff = 2048,
    n_heads = 8, n_encoder_layers = 6, n_decoder_layers = 6,
    max_len = 2048, mode = 'predict')

In [18]:
model.init_from_file('gs://trax-ml/models/translation/ende_wmt32k.pkl.gz', weights_only = True)



In [19]:
sentence = 'I am only a machine but I have machine intelligence.'

tokenized = list(trax.data.tokenize(iter([sentence]),  # Operates on streams.
                                    vocab_dir='gs://trax-ml/vocabs/',
                                    vocab_file='ende_32k.subword'))[0]

In [20]:
tokenized = tokenized[None, :]  # Add batch dimension.
tokenized_translation = trax.supervised.decoding.autoregressive_sample(
    model, tokenized, temperature=0.0)  # Higher temperature: more diverse results.

In [21]:
tokenized_translation = tokenized_translation[0][:-1]  # Remove batch and EOS.
translation = trax.data.detokenize(tokenized_translation,
                                   vocab_dir='gs://trax-ml/vocabs/',
                                   vocab_file='ende_32k.subword')
print("The sentence:",sentence)
print("The translation:",translation)

The sentence: I am only a machine but I have machine intelligence.
The translation: Ich bin nur eine Maschine, aber ich habe Maschinenübersicht.
