# Preprocessing for NMT Model

This example was taken from the wonderful Cutting Edge Deep Learning for Coders course, as taught by Jeremy Howard (http://course.fast.ai/part2.html). The course is now live and I encourage you to check it out.


In [1]:
%matplotlib inline
import importlib
#import sutils; importlib.reload(sutils)
from sutils import *

import os
import keras
import gensim
import re
import pickle
import collections
import keras.backend as K

from keras_tqdm import TQDMNotebookCallback
from keras import initializers
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential
from keras.layers import *
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, Callback, ReduceLROnPlateau, LearningRateScheduler, EarlyStopping, TensorBoard
from keras.callbacks import LambdaCallback


from recurrentshop import *
import seq2seq
from seq2seq.models import AttentionSeq2Seq,SimpleSeq2Seq, Seq2Seq

import tensorflow as tf

Using TensorFlow backend.


We will use **gensim** and **word2vec** to load the pretrained German embeddings; the English embeddings come from GloVe.

In [2]:
from gensim.models import word2vec

In [3]:
#limit_gpu_mem()

In [4]:
# Working directories: `dpath` is the sub-directory of `path` where the
# preprocessed pickles are written further down in this notebook.
path = '../neural_translation_en_de/'
dpath = '../neural_translation_en_de/translate/'
# exist_ok=True makes each call idempotent and avoids the check-then-create
# race of the `if not os.path.exists(...)` pattern.
os.makedirs(path, exist_ok=True)
os.makedirs(dpath, exist_ok=True)

## Preparing the Corpus

We build a parallel corpus of English sentences and their German translations. The question-only regex filter is commented out below, so every sentence pair is kept.

In [5]:
# Split the tab-separated training file into two separate, line-aligned files,
# one per language: English (first column) and German (second column).

# Read inside a context manager so the file handle is closed deterministically.
with open('../DE_EN_(wmt16_google_nmt)_train.txt', encoding='UTF-8') as file:
    lines = file.read().split('\n')
print(len(lines))
print(lines[0])
lines_de = []
lines_en = []
for line in lines:
    if not line:
        # An empty string (e.g. produced by a trailing newline) cannot be
        # unpacked into a sentence pair; skip it instead of crashing.
        continue
    input_text, target_text = line.split('\t')
    lines_en.append(input_text)
    lines_de.append(target_text)
assert len(lines_de) == len(lines_en)

# '\n'.join writes no trailing newline, so reading the file back with
# .read().split('\n') yields exactly the sentences written here.
with open('../DE_EN_(wmt16_google_nmt)_train_german_only.txt', 'w', encoding='utf8') as file:
    file.write('\n'.join(lines_de))
with open('../DE_EN_(wmt16_google_nmt)_train_english_only.txt', 'w', encoding='utf8') as file:
    file.write('\n'.join(lines_en))


1050001
However, what guarantee would we have that they would not use monopoly power to price these books above the range of ordinary citizens?	Welche Garantie hätten wir jedoch, dass das Unternehmen seine Monopolstellung nicht dazu ausnutzen würde, die Preise dieser Bücher über dem festzulegen, was normale Bürger sich leisten können?


In [6]:
# Common prefix of the two single-language files written by the split cell.
fname = "../" + 'DE_EN_(wmt16_google_nmt)_train_'
en_fname = f"{fname}english_only.txt"
# NOTE: the `fr_` prefix is a leftover name; this is the German file.
fr_fname = f"{fname}german_only.txt"

In [7]:
# this creates the Regex for filtering just for questions
#re_eq = re.compile('^(Wh[^?.!]+\?)')
#re_fq = re.compile('^([^?.!]+\?)')

In [8]:
#this runs our regex search on the full corpus and filters it down
#lines = ((re_eq.search(eq), re_fq.search(fq)) 
#         for eq, fq in zip(open(en_fname, encoding='utf8'), open(fr_fname, encoding='utf8')))

In [9]:
# Pair each English sentence with the German sentence on the same line number.
# Both files are read inside a context manager so their handles are closed
# instead of being left to the garbage collector.
with open(en_fname, encoding='utf8') as en_file, open(fr_fname, encoding='utf8') as de_file:
    lines = list(zip(en_file.read().split('\n'), de_file.read().split('\n')))
# No filtering is applied (the question regexes above are commented out),
# so `questions` is simply the full list of (english, german) tuples.
questions = lines
questions[0:5]

[('However, what guarantee would we have that they would not use monopoly power to price these books above the range of ordinary citizens?',
  'Welche Garantie hätten wir jedoch, dass das Unternehmen seine Monopolstellung nicht dazu ausnutzen würde, die Preise dieser Bücher über dem festzulegen, was normale Bürger sich leisten können?'),
 ('The debate is closed.', 'Die Aussprache ist geschlossen.'),
 ('That is why there are so few amendments and, furthermore, they promote the message which we must send at the beginning of the process, above all to the other wing of the budgetary authority.',
  'So kommt es, dass es wenig Änderungsanträge gibt, und damit wird die Botschaft unterstützt, die wir zu Beginn des Verfahrens vor allem an den anderen Arm der Haushaltsbehörde aussenden müssen.'),
 ('If necessary, my colleague Evelyne Gebhardt from the Committee on Legal Affairs will represent me.',
  'In diesem Fall würde mich die Kollegin Evelyne Gebhardt aus dem Rechtsausschuß vertreten.'),
 (

Now we want to put them all in a list so that we can easily access them

In [10]:
#questions = [(e.group(), f.group()) for e,f in lines if e and f]
#len(questions)

In [11]:
#questions[5:10]

Now let's save this so we can come back to it in the future

In [12]:
# Persist the sentence pairs under `dpath`. `dump` is not defined in this
# notebook (presumably it comes from `from sutils import *` — confirm).
dump(questions, dpath+'questionswmt.pkl')

Loading and unwrapping the raw English/German sentence pairs (the variables keep the `fr_` prefix for the German side)

In [13]:
# Reload the saved pairs and unzip them into two parallel tuples.
# NOTE: the `fr_` prefix is a leftover name; `fr_qs` holds the German sentences.
questions = load(dpath+'questionswmt.pkl')
en_qs, fr_qs = zip(*questions)

Next we need to split the sentences into tokens so that we can make sequences for the model

In [14]:
# Runs of one or more space characters (collapsed to a single space later).
re_mult_space = re.compile(r"  *")
# A word character followed by an apostrophe (’ or ') and another word
# character; the two groups let the tokenizer insert a space after the apostrophe.
re_mw_punc = re.compile(r"(\w[’'])(\w)")
# Any single punctuation character from this set, captured so it can be
# surrounded by spaces and become its own token.
re_punc = re.compile("([\"().,;:/_?!—])")
# A word character followed by 's at a word boundary (possessive / contraction).
re_apos = re.compile(r"(\w)'s\b")

In [15]:
def simple_toks(sent):
    """Tokenise one sentence into a list of lower-cased tokens.

    Steps, in order: detach ``'s`` from the preceding word, put a space after
    an in-word apostrophe, surround punctuation with spaces, turn hyphens into
    spaces, collapse repeated spaces, then lower-case and split on whitespace.
    Relies on the module-level patterns ``re_apos``, ``re_mw_punc``,
    ``re_punc`` and ``re_mult_space``.
    """
    spaced = re_mw_punc.sub(r"\1 \2", re_apos.sub(r"\1 's", sent))
    spaced = re_punc.sub(r" \1 ", spaced)
    dehyphenated = spaced.replace('-', ' ')
    collapsed = re_mult_space.sub(' ', dehyphenated)
    return collapsed.lower().split()

In [16]:
# Tokenise every German sentence (the `fr_` prefix is a leftover name).
fr_qtoks = [simple_toks(sentence) for sentence in fr_qs]
fr_qtoks[:4]

[['welche',
  'garantie',
  'hätten',
  'wir',
  'jedoch',
  ',',
  'dass',
  'das',
  'unternehmen',
  'seine',
  'monopolstellung',
  'nicht',
  'dazu',
  'ausnutzen',
  'würde',
  ',',
  'die',
  'preise',
  'dieser',
  'bücher',
  'über',
  'dem',
  'festzulegen',
  ',',
  'was',
  'normale',
  'bürger',
  'sich',
  'leisten',
  'können',
  '?'],
 ['die', 'aussprache', 'ist', 'geschlossen', '.'],
 ['so',
  'kommt',
  'es',
  ',',
  'dass',
  'es',
  'wenig',
  'änderungsanträge',
  'gibt',
  ',',
  'und',
  'damit',
  'wird',
  'die',
  'botschaft',
  'unterstützt',
  ',',
  'die',
  'wir',
  'zu',
  'beginn',
  'des',
  'verfahrens',
  'vor',
  'allem',
  'an',
  'den',
  'anderen',
  'arm',
  'der',
  'haushaltsbehörde',
  'aussenden',
  'müssen',
  '.'],
 ['in',
  'diesem',
  'fall',
  'würde',
  'mich',
  'die',
  'kollegin',
  'evelyne',
  'gebhardt',
  'aus',
  'dem',
  'rechtsausschuß',
  'vertreten',
  '.']]

In [17]:
# Tokenise every English sentence.
en_qtoks = [simple_toks(sentence) for sentence in en_qs]
en_qtoks[:4]

[['however',
  ',',
  'what',
  'guarantee',
  'would',
  'we',
  'have',
  'that',
  'they',
  'would',
  'not',
  'use',
  'monopoly',
  'power',
  'to',
  'price',
  'these',
  'books',
  'above',
  'the',
  'range',
  'of',
  'ordinary',
  'citizens',
  '?'],
 ['the', 'debate', 'is', 'closed', '.'],
 ['that',
  'is',
  'why',
  'there',
  'are',
  'so',
  'few',
  'amendments',
  'and',
  ',',
  'furthermore',
  ',',
  'they',
  'promote',
  'the',
  'message',
  'which',
  'we',
  'must',
  'send',
  'at',
  'the',
  'beginning',
  'of',
  'the',
  'process',
  ',',
  'above',
  'all',
  'to',
  'the',
  'other',
  'wing',
  'of',
  'the',
  'budgetary',
  'authority',
  '.'],
 ['if',
  'necessary',
  ',',
  'my',
  'colleague',
  'evelyne',
  'gebhardt',
  'from',
  'the',
  'committee',
  'on',
  'legal',
  'affairs',
  'will',
  'represent',
  'me',
  '.']]

Now we need to convert tokens to ids so that we can create lookup tables

we also insert the "PAD" and "UNK" tokens in here

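this function returns
- ids - every sentence as a list of token ids (tokens outside the vocabulary map to `<UNK>`)
- vocab - the list of words, most frequent first, after `<PAD>` and `<UNK>` (capped at 40002 entries)
- w2id - a dict for looking up the id of a word
- voc_cnt - the count of every token in the corpus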

In [18]:
def toks2ids(sents, max_vocab=40002):
    """Build a frequency-ranked vocabulary and encode tokenised sentences as ids.

    Args:
        sents: iterable of sentences, each a list of string tokens. It is
            iterated twice, so it must not be a one-shot generator.
        max_vocab: maximum vocabulary size, including the two special tokens
            ``<PAD>`` (id 0) and ``<UNK>`` (id 1). Defaults to 40002, the value
            that was previously hard-coded.

    Returns:
        A tuple ``(ids, vocab, w2id, voc_cnt)``:
        ids - list of lists of token ids; out-of-vocabulary tokens get the
        ``<UNK>`` id.
        vocab - list of words where the position of a word is its id.
        w2id - dict mapping word -> id.
        voc_cnt - ``collections.Counter`` with the count of every token.

    Raises:
        ValueError: if ``max_vocab`` is smaller than 2, because the special
            tokens would not fit.
    """
    if max_vocab < 2:
        raise ValueError("max_vocab must be at least 2 to hold <PAD> and <UNK>")
    voc_cnt = collections.Counter(t for sent in sents for t in sent)
    # Most frequent first; the sort is stable, so ties keep first-seen order.
    ranked = sorted(voc_cnt, key=voc_cnt.get, reverse=True)
    vocab = (["<PAD>", "<UNK>"] + ranked)[:max_vocab]
    w2id = {w: i for i, w in enumerate(vocab)}
    unk_id = w2id["<UNK>"]
    # dict.get with a default replaces the per-token try/except KeyError.
    ids = [[w2id.get(t, unk_id) for t in sent] for sent in sents]
    print(len(ids),len(vocab),len(w2id),len(voc_cnt))
    return ids, vocab, w2id, voc_cnt

In [19]:
# Encode both sides of the corpus. The `fr_` names hold the German data
# (leftover prefix); each call prints sentence/vocab/lookup/distinct-token counts.
fr_ids, fr_vocab, fr_w2id, fr_counts = toks2ids(fr_qtoks)
en_ids, en_vocab, en_w2id, en_counts = toks2ids(en_qtoks)
len(en_vocab), len(fr_vocab)

1050001 40002 40002 677741
1050001 40002 40002 343772


(40002, 40002)

#### Sentences converted to id sequences

In [21]:
# Show the same sentence as ids and as tokens to sanity-check the encoding.
print(en_ids[1])
print(en_qtoks[1])

[2, 241, 10, 1057, 4]
['the', 'debate', 'is', 'closed', '.']


#### The look up tables / dictionaries 

In [22]:
# id -> word: a word's id is its position in the vocabulary list.
en_vocab[18]

'are'

In [23]:
# word -> id lookup through the dictionary.
en_w2id['do']

66

## Word Embeddings

here we are going to make look up tables for words to embeddings

The GloVe embeddings used here are 400k words with 200 dimensions

In [24]:
def old_load_glove(loc):
    """Load embeddings that were pre-split into three files sharing the prefix ``loc``.

    Reads ``loc + '.txt'`` through ``load_array`` (not defined in this
    notebook), and unpickles ``loc + '_words.pkl'`` and ``loc + '_idx.pkl'``.

    Returns:
        A tuple ``(vectors, words, word_index)`` in that order.
    """
    vecs = load_array(loc+'.txt')
    # NOTE(review): pickle.load can execute arbitrary code; only use it on
    # files from a trusted source.
    # Context managers close the pickle files, which were previously leaked.
    with open(loc+'_words.pkl','rb') as words_file:
        words = pickle.load(words_file, encoding='latin1')
    with open(loc+'_idx.pkl','rb') as idx_file:
        idx = pickle.load(idx_file, encoding='latin1')
    return (vecs, words, idx)
def load_glove(loc):
    """Load text-format word embeddings from ``loc + '.txt'``.

    Every non-blank line is expected to be a token followed by its vector
    components, all separated by single spaces.

    Args:
        loc: path of the embedding file without the ``.txt`` extension.

    Returns:
        A tuple ``(vectors, words, word_index)``:
        vectors - float32 array with one row per line read (previously the
        components were left as strings, the last one still carrying the
        newline, which produced a unicode array).
        words - list of tokens in file order.
        word_index - dict mapping token -> row index; for a repeated token
        the last occurrence wins.
    """
    en_wv_word = []
    en_wv_idx = {}
    en_vecs = []
    # NOTE(review): latin1 never fails to decode, but non-ASCII tokens of a
    # UTF-8 encoded file would be mis-decoded — confirm the file's encoding.
    with open(loc + '.txt', encoding='latin1') as file:
        # Stream the file line by line instead of holding readlines() in memory.
        for line in file:
            fields = line.rstrip('\n').split(' ')
            if not fields[0] and len(fields) == 1:
                # Blank line: nothing to parse.
                continue
            word = fields[0]
            en_wv_idx[word] = len(en_wv_word)
            en_wv_word.append(word)
            en_vecs.append(np.asarray([float(v) for v in fields[1:]], dtype=np.float32))
    return (np.asarray(en_vecs), en_wv_word, en_wv_idx)

In [25]:
# load_glove appends '.txt' to this prefix before opening the file.
en_vecs, en_wv_word, en_wv_idx = load_glove('../embeddings/glove/6B.200d')

In [26]:
# word -> vector dictionary, resolved through the word -> row-index table.
en_w2v = {w: en_vecs[en_wv_idx[w]] for w in en_wv_word}

In [27]:
# Number of English vectors and their dimensionality, read from the array.
n_en_vec, dim_en_vec = en_vecs.shape
# Dimensionality used for the German embedding matrix (leftover `fr_` name).
# NOTE(review): must equal the vector size of the word2vec model loaded below.
dim_fr_vec = 300

In [28]:
# Report the shape of the English embedding table.
print("dim_en_vec", dim_en_vec)
print("n_en_vec", n_en_vec)

dim_en_vec 200
n_en_vec 400000


In [29]:
#fr_wik = pickle.load(open('/data/TensorFlowTalks/embeddings/french/polyglot-fr.pkl', 'rb'), 
#                     encoding='latin1')

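The French embeddings referenced here were trained by Jean-Philippe Fauconnier. Note that the cell below loads German word2vec vectors (`word_emb_de.bin`) instead; their source is not recorded in this notebook.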

- Word vectors: http://fauconnier.github.io/index.html#wordembeddingmodels
- frWac: http://wacky.sslmit.unibo.it/doku.php?id=corpora

In [30]:
# Binary word2vec file with the German vectors (the `fr_` names are leftovers).
w2v_path='../embeddings/german/word_emb_de.bin'

fr_model = gensim.models.KeyedVectors.load_word2vec_format(w2v_path, binary=True)
# NOTE(review): `KeyedVectors.vocab` was removed in gensim 4 (replaced by
# `key_to_index`) — confirm the installed version. `fr_voc` is not referenced
# again in the visible part of this notebook.
fr_voc = fr_model.vocab

In [31]:
def create_emb(w2v, targ_vocab, dim_vec, vocab_size=40002):
    """Build an embedding matrix whose row ``i`` is the vector of ``targ_vocab[i]``.

    Args:
        w2v: mapping-like object; ``w2v[word]`` returns the vector of ``word``
            and raises ``KeyError`` for unknown words.
        targ_vocab: iterable of words where a word's position is its id.
        dim_vec: dimensionality of the vectors.
        vocab_size: number of rows of the matrix. Defaults to 40002, the value
            that was previously hard-coded. Words beyond this position are
            ignored.

    Returns:
        Array of shape ``(vocab_size, dim_vec)``. Rows for words missing from
        ``w2v`` (this applies to any word, including special tokens) are drawn
        from ``normal(scale=0.6)``; rows past the end of a shorter
        ``targ_vocab`` stay zero.
    """
    emb = np.zeros((vocab_size, dim_vec))

    # zip with range stops after vocab_size words without needing a break.
    for i, word in zip(range(vocab_size), targ_vocab):
        try:
            emb[i] = w2v[word]
        except KeyError:
            # If we can't find the word, randomly initialize.
            # `normal` is not defined in this notebook (presumably
            # numpy.random.normal via `from sutils import *` — confirm).
            emb[i] = normal(scale=0.6, size=(dim_vec,))

    return emb

In [32]:
# English embedding matrix: one row per entry of en_vocab.
en_embs = create_emb(en_w2v, en_vocab, dim_en_vec); en_embs.shape

(40002, 200)

In [33]:
# German embedding matrix (leftover `fr_` names), looked up in the word2vec model.
fr_embs = create_emb(fr_model, fr_vocab, dim_fr_vec); fr_embs.shape

(40002, 300)

## Data checks

In [34]:
# Histogram of English sentence lengths (in tokens): length -> number of sentences.
en_lengths = collections.Counter(len(s) for s in en_ids)

### Keras pad_sequences 

In [35]:
# Fixed sequence length passed to pad_sequences for both languages.
maxlen = 100

In [36]:
# Pad short sentences and cut long ones at the end ("post") to exactly maxlen ids.
en_padded = pad_sequences(en_ids, maxlen, padding="post", truncating="post")

In [37]:
# Same padding/truncation for the German side (leftover `fr_` name).
fr_padded = pad_sequences(fr_ids, maxlen, padding="post", truncating="post")

In [38]:
# Sanity check: both padded arrays should have the same shape.
en_padded.shape, fr_padded.shape, en_embs.shape

((1050001, 100), (1050001, 100), (40002, 200))

In [39]:
# 90/10 train/test split. One shared permutation is used for both languages
# so that row i of the English arrays still matches row i of the German ones.
# NOTE(review): no random seed is set, so the split differs between runs.
n = int(len(en_ids)*0.9)
idxs = np.random.permutation(len(en_ids))
# Slice the permutation first and fancy-index once per split. The previous
# code evaluated `padded[idxs]` twice per language, materialising four full
# shuffled copies of the data; the resulting arrays are identical.
train_idxs, test_idxs = idxs[:n], idxs[n:]
fr_train, fr_test = fr_padded[train_idxs], fr_padded[test_idxs]
en_train, en_test = en_padded[train_idxs], en_padded[test_idxs]

In [40]:
# Inspect the first padded English training sequence.
en_train[0]

array([   2,   95, 1400,   13,    2,   61,    7,  335,    9, 1280, 1960,
          5, 3633, 1334,    8,    2,  445,    5,    2, 3156,  868, 1101,
         13, 3633, 1334,    3,    6,   21,   37, 2668,   11,    2,  868,
       1101,    7,   19, 1292,   29,    2,  233,    5, 1062,   14,  138,
         26, 1599,    7,    2,  505,  443,    9,  255, 1601,  780,    2,
         80,    5,    2, 1183,    5, 3633, 1334,    4,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0], dtype=int32)

In [41]:
# Invert the German word -> id table and decode the first training sequence,
# printing one token per line (padding positions included).
reverse_word_index = {i: word for word, i in fr_w2id.items()}
for a in fr_train[0]:
    print(reverse_word_index[a])

im
bericht
wird
die
kommission
ersucht
,
im
rahmen
des
kommenden
grünbuchs
über
den
territorialen
zusammenhalt
eine
umfassende
definition
des
territorialen
zusammenhalts
vorzulegen
,
und
ich
kann
versichern
,
dass
das
für
ende
september
dieses
jahres
zu
erwartende
grünbuch
zu
einem
fortschritt
im
<UNK>
gemeinsamen
verständnis
des
konzepts
der
territorialen
kohäsion
beitragen
wird
.
<PAD>
<PAD>
<PAD>
<PAD>
<PAD>
<PAD>
<PAD>
<PAD>
<PAD>
<PAD>
<PAD>
<PAD>
<PAD>
<PAD>
<PAD>
<PAD>
<PAD>
<PAD>
<PAD>
<PAD>
<PAD>
<PAD>
<PAD>
<PAD>
<PAD>
<PAD>
<PAD>
<PAD>
<PAD>
<PAD>
<PAD>
<PAD>
<PAD>
<PAD>
<PAD>
<PAD>
<PAD>
<PAD>
<PAD>
<PAD>
<PAD>
<PAD>
<PAD>
<PAD>
<PAD>
<PAD>


## Saving the data

In [44]:
# needed to save  
look_ups = {'en_w2id':en_w2id,'fr_vocab':fr_vocab,'en_vocab':en_vocab, 'en_embs':en_embs,'fr_embs':fr_embs}
dump(look_ups, dpath+'look_upswmt.pkl')

In [None]:
# Padded train/test id arrays for both languages, saved under `dpath`.
data={'fr_train':fr_train,'en_train':en_train,'fr_test':fr_test,'en_test':en_test,}
dump(data, dpath+'nmt_datawmt.pkl')