# Preprocessing for NMT Model

This example was taken from the wonderful Cutting Edge Deep Learning for Coders course as taught by Jeremy Howard (http://course.fast.ai/part2.html). The course is now live and I encourage you to check it out.


In [1]:
%matplotlib inline
import importlib
#import sutils; importlib.reload(sutils)
from sutils import *

import os
import keras
import gensim
import re
import pickle
import collections
import keras.backend as K

from keras_tqdm import TQDMNotebookCallback
from keras import initializers
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential
from keras.layers import *
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, Callback, ReduceLROnPlateau, LearningRateScheduler, EarlyStopping, TensorBoard
from keras.callbacks import LambdaCallback


from recurrentshop import *
import seq2seq
from seq2seq.models import AttentionSeq2Seq,SimpleSeq2Seq, Seq2Seq

import tensorflow as tf

Using TensorFlow backend.


We will use **gensim** to load the pretrained **word2vec** embeddings for the German side; the English side uses pretrained GloVe vectors (see "Word Embeddings" below).

In [2]:
from gensim.models import word2vec

In [3]:
SOS = True
EOS = True
UNK = True

In [4]:
# Output locations: `path` is the project folder and `dpath` the sub-folder
# that the later cells pass to dump()/load() for the pickled artefacts.
path = '../neural_translation_en_de_attention/'
dpath = '../neural_translation_en_de_attention/translate/'
# exist_ok=True keeps the cell re-runnable and removes the race between
# checking for the directory and creating it.
os.makedirs(path, exist_ok=True)
os.makedirs(dpath, exist_ok=True)

## Preparing the Corpus

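We will build a parallel corpus of English sentences and their partners in German. (The original course example kept only English questions paired with French; that question filter is commented out below, and the `fr_*` variable names are kept for the German side.)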

In [5]:
# Split the tab-separated training file into two separate files, one per language.

# Read through a context manager so the handle is closed as soon as we are done.
with open('../DE_EN_(wmt16_google_nmt)_train.txt', encoding='UTF-8') as file:
    lines = file.read().split('\n')
print(len(lines))
print(lines[0])
lines_de = []
lines_en = []
for line in lines:
    # Each line is "<english>\t<german>"; any line that does not contain
    # exactly one tab raises ValueError here instead of being silently skipped.
    input_text, target_text = line.split('\t')
    lines_en.append(input_text)
    lines_de.append(target_text)
assert len(lines_de) == len(lines_en)

# '\n'.join puts a newline between sentences but none after the last one, so
# reading the files back with .split('\n') yields exactly len(lines) entries.
with open('../DE_EN_(wmt16_google_nmt)_train_german_only.txt', 'w', encoding='utf8') as file:
    file.write('\n'.join(lines_de))
with open('../DE_EN_(wmt16_google_nmt)_train_english_only.txt', 'w', encoding='utf8') as file:
    file.write('\n'.join(lines_en))


1050001
However, what guarantee would we have that they would not use monopoly power to price these books above the range of ordinary citizens?	Welche Garantie hätten wir jedoch, dass das Unternehmen seine Monopolstellung nicht dazu ausnutzen würde, die Preise dieser Bücher über dem festzulegen, was normale Bürger sich leisten können?


In [6]:
fname = "../" + 'DE_EN_(wmt16_google_nmt)_train_'
en_fname = fname + 'english_only.txt'
fr_fname = fname + 'german_only.txt'

In [7]:
# this creates the Regex for filtering just for questions
#re_eq = re.compile('^(Wh[^?.!]+\?)')
#re_fq = re.compile('^([^?.!]+\?)')

In [8]:
#this runs our regex search on the full corpus and filters it down
#lines = ((re_eq.search(eq), re_fq.search(fq)) 
#         for eq, fq in zip(open(en_fname, encoding='utf8'), open(fr_fname, encoding='utf8')))

In [9]:
# Pair each English sentence with the German sentence on the same line number.
# Both files are opened in one `with` so their handles are closed once the
# pairs have been built (the original left both files open).
with open(en_fname, encoding='utf8') as en_file, open(fr_fname, encoding='utf8') as de_file:
    lines = list(zip(en_file.read().split('\n'), de_file.read().split('\n')))
questions = lines
questions[0:5]

[('However, what guarantee would we have that they would not use monopoly power to price these books above the range of ordinary citizens?',
  'Welche Garantie hätten wir jedoch, dass das Unternehmen seine Monopolstellung nicht dazu ausnutzen würde, die Preise dieser Bücher über dem festzulegen, was normale Bürger sich leisten können?'),
 ('The debate is closed.', 'Die Aussprache ist geschlossen.'),
 ('That is why there are so few amendments and, furthermore, they promote the message which we must send at the beginning of the process, above all to the other wing of the budgetary authority.',
  'So kommt es, dass es wenig Änderungsanträge gibt, und damit wird die Botschaft unterstützt, die wir zu Beginn des Verfahrens vor allem an den anderen Arm der Haushaltsbehörde aussenden müssen.'),
 ('If necessary, my colleague Evelyne Gebhardt from the Committee on Legal Affairs will represent me.',
  'In diesem Fall würde mich die Kollegin Evelyne Gebhardt aus dem Rechtsausschuß vertreten.'),
 (

Now we want to put them all in a list so that we can easily access them

In [10]:
#questions = [(e.group(), f.group()) for e,f in lines if e and f]
#len(questions)

In [11]:
#questions[5:10]

Now let's save this so we can come back to it in the future

In [12]:
dump(questions, dpath+'questionswmt.pkl')

Loading and unwrapping the raw English/German sentence pairs (the German side keeps the `fr_` variable prefix from the original French example)

In [13]:
questions = load(dpath+'questionswmt.pkl')
en_qs, fr_qs = zip(*questions)

Next we need to split the questions into tokens so that we can make sequences for the model

In [14]:
# Matches one or more consecutive spaces (used to collapse them into one).
re_mult_space = re.compile(r"  *")
# A word character plus an apostrophe (’ or ') followed by another word
# character; the two groups let simple_toks insert a space after the apostrophe.
re_mw_punc = re.compile(r"(\w[’'])(\w)")
# A single punctuation character, captured so it can be padded with spaces.
re_punc = re.compile("([\"().,;:/_?!—])")
# A trailing 's on a word; the group keeps the character in front of it.
re_apos = re.compile(r"(\w)'s\b")

In [15]:
def simple_toks(sent):
    """Tokenise one raw sentence into a list of lower-cased tokens.

    The sentence is rewritten so that every token is surrounded by spaces and
    is then split on whitespace:

    * a trailing ``'s`` is detached from its word,
    * a space is inserted after an in-word apostrophe,
    * each punctuation character matched by ``re_punc`` becomes its own token,
    * hyphens are replaced by spaces.

    Relies on the module-level patterns ``re_apos``, ``re_mw_punc``,
    ``re_punc`` and ``re_mult_space``.
    """
    spaced = re_apos.sub(r"\1 's", sent)
    spaced = re_mw_punc.sub(r"\1 \2", spaced)
    spaced = re_punc.sub(r" \1 ", spaced)
    spaced = spaced.replace('-', ' ')
    collapsed = re_mult_space.sub(' ', spaced)
    tokens = collapsed.lower().split()
    return tokens

In [16]:
fr_qtoks = list(map(simple_toks, fr_qs)); fr_qtoks[:4]

[['welche',
  'garantie',
  'hätten',
  'wir',
  'jedoch',
  ',',
  'dass',
  'das',
  'unternehmen',
  'seine',
  'monopolstellung',
  'nicht',
  'dazu',
  'ausnutzen',
  'würde',
  ',',
  'die',
  'preise',
  'dieser',
  'bücher',
  'über',
  'dem',
  'festzulegen',
  ',',
  'was',
  'normale',
  'bürger',
  'sich',
  'leisten',
  'können',
  '?'],
 ['die', 'aussprache', 'ist', 'geschlossen', '.'],
 ['so',
  'kommt',
  'es',
  ',',
  'dass',
  'es',
  'wenig',
  'änderungsanträge',
  'gibt',
  ',',
  'und',
  'damit',
  'wird',
  'die',
  'botschaft',
  'unterstützt',
  ',',
  'die',
  'wir',
  'zu',
  'beginn',
  'des',
  'verfahrens',
  'vor',
  'allem',
  'an',
  'den',
  'anderen',
  'arm',
  'der',
  'haushaltsbehörde',
  'aussenden',
  'müssen',
  '.'],
 ['in',
  'diesem',
  'fall',
  'würde',
  'mich',
  'die',
  'kollegin',
  'evelyne',
  'gebhardt',
  'aus',
  'dem',
  'rechtsausschuß',
  'vertreten',
  '.']]

In [17]:
en_qtoks = list(map(simple_toks, en_qs)); en_qtoks[:4]

[['however',
  ',',
  'what',
  'guarantee',
  'would',
  'we',
  'have',
  'that',
  'they',
  'would',
  'not',
  'use',
  'monopoly',
  'power',
  'to',
  'price',
  'these',
  'books',
  'above',
  'the',
  'range',
  'of',
  'ordinary',
  'citizens',
  '?'],
 ['the', 'debate', 'is', 'closed', '.'],
 ['that',
  'is',
  'why',
  'there',
  'are',
  'so',
  'few',
  'amendments',
  'and',
  ',',
  'furthermore',
  ',',
  'they',
  'promote',
  'the',
  'message',
  'which',
  'we',
  'must',
  'send',
  'at',
  'the',
  'beginning',
  'of',
  'the',
  'process',
  ',',
  'above',
  'all',
  'to',
  'the',
  'other',
  'wing',
  'of',
  'the',
  'budgetary',
  'authority',
  '.'],
 ['if',
  'necessary',
  ',',
  'my',
  'colleague',
  'evelyne',
  'gebhardt',
  'from',
  'the',
  'committee',
  'on',
  'legal',
  'affairs',
  'will',
  'represent',
  'me',
  '.']]

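Now we need to convert tokens to ids so that we can create lookup tables.

We also insert the special tokens here: `<PAD>`, and (depending on the flags set above) `<SOS>`, `<EOS>` and `<UNK>`.

This function returns:
- ids - the sentences as lists of word ids
- vocab - the list of words, most frequent first, so that a word's position is its id
- w2id - the dictionary for looking up the id of a word
- voc_cnt - the vocab count (how often each word occurs in the corpus)
- pad_id, sos_id, eos_id, unk_id - the ids of the special tokens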

In [18]:
def toks2ids(sents, are_source_sentences=False, max_vocab=40003):
    """Convert tokenised sentences into lists of integer ids.

    The vocabulary is ordered by descending token frequency, with the special
    tokens placed in front: ``<PAD>`` always, ``<SOS>`` / ``<EOS>`` when the
    module-level flags ``SOS`` / ``EOS`` are truthy, and ``<UNK>`` when
    ``are_source_sentences`` is true and the module-level flag ``UNK`` is
    truthy. The vocabulary is then cut to its first ``max_vocab`` entries
    (special tokens included).

    Parameters
    ----------
    sents : iterable of token lists. It is traversed twice, so it must be
        re-iterable (e.g. a list).
    are_source_sentences : when true (and ``UNK`` is set) out-of-vocabulary
        tokens are mapped to ``<UNK>``; otherwise they are dropped from the
        id sequence.
    max_vocab : maximum vocabulary size; defaults to the previously
        hard-coded 40003.

    Returns
    -------
    (ids, vocab, w2id, voc_cnt, pad_id, sos_id, eos_id, unk_id)
        ``unk_id`` is the position ``<UNK>`` would take; when ``<UNK>`` is
        not inserted that position holds an ordinary word.
    """
    voc_cnt = collections.Counter(t for sent in sents for t in sent)
    vocab = sorted(voc_cnt, key=voc_cnt.get, reverse=True)

    pad_id = 0
    sos_id = 1
    eos_id = 1
    unk_id = 1

    # Each inserted special token shifts the ids of the ones that follow it.
    vocab.insert(pad_id, "<PAD>")
    if SOS:
        vocab.insert(sos_id, "<SOS>")
        eos_id += 1
        unk_id += 1
    if EOS:
        vocab.insert(eos_id, "<EOS>")
        unk_id += 1
    use_unk = bool(are_source_sentences and UNK)
    if use_unk:
        vocab.insert(unk_id, "<UNK>")

    vocab = vocab[:max_vocab]
    w2id = {w: i for i, w in enumerate(vocab)}

    ids = []
    for sent in sents:
        sent_ids = []
        # Same truthiness test as the vocabulary construction above.
        if SOS:
            sent_ids.append(w2id["<SOS>"])
        for t in sent:
            tok_id = w2id.get(t)
            if tok_id is not None:
                sent_ids.append(tok_id)
            elif use_unk:
                sent_ids.append(w2id["<UNK>"])
            # else: out-of-vocabulary token is intentionally dropped.
        if EOS:
            sent_ids.append(w2id["<EOS>"])
        ids.append(sent_ids)
    print(len(ids),len(vocab),len(w2id),len(voc_cnt))
    return ids, vocab, w2id, voc_cnt, pad_id, sos_id, eos_id, unk_id

In [19]:
fr_ids, fr_vocab, fr_w2id, fr_counts, pad_id, sos_id, eos_id, unk_id = toks2ids(fr_qtoks)
print(pad_id, sos_id, eos_id, unk_id )
en_ids, en_vocab, en_w2id, en_counts, pad_id, sos_id, eos_id, unk_id = toks2ids(en_qtoks, are_source_sentences=True)
print(pad_id, sos_id, eos_id, unk_id )
len(en_vocab), len(fr_vocab)

1050001 40003 40003 677741
0 1 2 3
1050001 40003 40003 343772
0 1 2 3


(40003, 40003)

#### Sentences converted to vectors

In [20]:
print(en_ids[1])
print(en_qtoks[1])

[1, 4, 243, 12, 1059, 6, 2]
['the', 'debate', 'is', 'closed', '.']


#### The look up tables / dictionaries 

In [21]:
en_vocab[18]

'we'

In [22]:
en_w2id['do']

68

## Word Embeddings

here we are going to make look up tables for words to embeddings

The GloVe embeddings used here cover 400k words with 200 dimensions

In [23]:
def old_load_glove(loc):
    """Load pre-converted GloVe data stored next to the prefix ``loc``.

    Reads ``loc + '.txt'`` through ``load_array`` and un-pickles
    ``loc + '_words.pkl'`` and ``loc + '_idx.pkl'``.

    Returns the tuple ``(vectors, words, word_index)`` in that order.

    NOTE(review): ``pickle.load`` can execute arbitrary code; only use this
    on files from a trusted source.
    """
    vecs = load_array(loc+'.txt')
    # Context managers close the pickle files; the original leaked both handles.
    with open(loc+'_words.pkl', 'rb') as words_file:
        words = pickle.load(words_file, encoding='latin1')
    with open(loc+'_idx.pkl', 'rb') as idx_file:
        idx = pickle.load(idx_file, encoding='latin1')
    return (vecs, words, idx)
def load_glove(loc, encoding='latin1', dtype='float32'):
    """Parse a GloVe text file into vectors and word look-up tables.

    The file ``loc + '.txt'`` is expected to hold one word per line followed
    by its space-separated vector components.

    Parameters
    ----------
    loc : path prefix; ``'.txt'`` is appended.
    encoding : text encoding used to read the file (default unchanged:
        ``'latin1'``).
        NOTE(review): if the file is actually UTF-8, non-ASCII words are
        mis-decoded with this default -- confirm against the downloaded file.
    dtype : numeric dtype of the returned vectors.

    Returns
    -------
    (vectors, words, word_index)
        ``vectors`` is a numeric array with one row per word (the original
        returned the components as strings, the last one still carrying the
        line's newline), ``words`` lists the words in file order and
        ``word_index`` maps each word to its row.
    """
    en_wv_word = []
    en_wv_idx = {}
    en_vecs = []
    # Stream the file instead of holding every raw line in memory at once.
    with open(loc + '.txt', encoding=encoding) as file:
        for line in file:
            parts = line.rstrip('\n').split(' ')
            if len(parts) < 2:
                # Blank line or a word without components: nothing to store.
                continue
            word = parts[0]
            en_wv_idx[word] = len(en_wv_word)
            en_wv_word.append(word)
            en_vecs.append(np.asarray(parts[1:], dtype=dtype))
    return (np.asarray(en_vecs), en_wv_word, en_wv_idx)

In [24]:
en_vecs, en_wv_word, en_wv_idx = load_glove('../embeddings/glove/6B.200d')

In [25]:
en_w2v = {w: en_vecs[en_wv_idx[w]] for w in en_wv_word}

In [26]:
n_en_vec, dim_en_vec = en_vecs.shape
dim_fr_vec = 300

In [27]:
print("dim_en_vec", dim_en_vec)
print("n_en_vec", n_en_vec)

dim_en_vec 200
n_en_vec 400000


In [31]:
#fr_wik = pickle.load(open('/data/TensorFlowTalks/embeddings/french/polyglot-fr.pkl', 'rb'), 
#                     encoding='latin1')

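The French embeddings were trained by Jean-Philippe Fauconnier. (Note: the cell below loads a German word2vec file, `word_emb_de.bin`, into the `fr_*` variables; this attribution is carried over from the original French example.)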

- Word vectors: http://fauconnier.github.io/index.html#wordembeddingmodels
- frWac: http://wacky.sslmit.unibo.it/doku.php?id=corpora

In [32]:
w2v_path='../embeddings/german/word_emb_de.bin'

fr_model = gensim.models.KeyedVectors.load_word2vec_format(w2v_path, binary=True)
fr_voc = fr_model.vocab

In [33]:
def create_emb(w2v, targ_vocab, dim_vec, vocab_size=40003):
    """Build the embedding matrix for the first ``vocab_size`` vocabulary words.

    Parameters
    ----------
    w2v : mapping-like object; ``w2v[word]`` must return a vector of length
        ``dim_vec`` or raise ``KeyError`` for unknown words.
    targ_vocab : iterable of words; the position of a word is its row index.
    dim_vec : dimensionality of the embedding vectors.
    vocab_size : number of rows in the matrix; defaults to the previously
        hard-coded 40003.

    Returns
    -------
    Array of shape ``(vocab_size, dim_vec)``. Rows for words missing from
    ``w2v`` are filled by ``normal(scale=0.6, ...)``; rows beyond the end of
    ``targ_vocab`` stay zero.
    """
    emb = np.zeros((vocab_size, dim_vec))

    # zip with range() stops after vocab_size words without an index check.
    for i, word in zip(range(vocab_size), targ_vocab):
        try:
            emb[i] = w2v[word]
        except KeyError:
            # If we can't find the word, randomly initialize.
            # NOTE(review): `normal` is not defined in this cell; presumably it
            # is numpy.random.normal via `from sutils import *` -- confirm.
            emb[i] = normal(scale=0.6, size=(dim_vec,))

    return emb

In [34]:
en_embs = create_emb(en_w2v, en_vocab, dim_en_vec); en_embs.shape

(40003, 200)

In [35]:
fr_embs = create_emb(fr_model, fr_vocab, dim_fr_vec); fr_embs.shape

(40003, 300)

## Data checks

In [36]:
en_lengths = collections.Counter(len(s) for s in en_ids)

### Keras pad_sequences 

In [37]:
maxlen = 100

In [38]:
en_padded = pad_sequences(en_ids, maxlen, padding="post", truncating="post")

In [39]:
fr_padded = pad_sequences(fr_ids, maxlen, padding="post", truncating="post")

In [40]:
print(eos_id,unk_id,pad_id,sos_id)

2 3 0 1


In [41]:
# A row whose last position is neither <PAD> nor <EOS> had its tail cut off;
# overwrite that last id with <EOS>. Done as one vectorised assignment
# instead of a Python loop over every row; the result is the same.
if EOS:
    last_ids = en_padded[:, maxlen-1]
    truncated = (last_ids != pad_id) & (last_ids != eos_id)
    en_padded[truncated, maxlen-1] = eos_id

In [42]:
print(en_padded[4915])
print(en_padded[0])

[    1   137    24    39   424  1290    31  7973   673     8   167   207
    15   272     6     3  6960   457  1068  1071  9712    17  8874     6
    10   553     5    38     7    37  1071   272     6     3  6960   457
  1068  9712    17  8874   634   141   549    87    22  5032     5 13117
  1669     5     8    72    34 24871    72 24755  3705     6    18  3446
  1326  2157     5  3446  1703    60     5  3446   694  3027   766     5
   320  2869  4574     5   320   533   463   448   297     5   737  2805
     9  1256    32    93  1109    43 16041     6  4176     5   220   680
  1109    43  6862     2]
[   1  134    5   76  766   51   18   26   13   56   51   27  103 5878  329
    9  434   61 2108  484    4  423    7 3271  303   77    2    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0

In [43]:
# A row whose last position is neither <PAD> nor <EOS> had its tail cut off;
# overwrite that last id with <EOS>. Done as one vectorised assignment
# instead of a Python loop over every row; the result is the same.
if EOS:
    last_ids = fr_padded[:, maxlen-1]
    truncated = (last_ids != pad_id) & (last_ids != eos_id)
    fr_padded[truncated, maxlen-1] = eos_id

In [44]:
maxlen

100

In [45]:
en_padded.shape, fr_padded.shape, en_embs.shape

((1050001, 100), (1050001, 100), (40003, 200))

In [47]:
# 90/10 train/test split over a random permutation of the sentence indices.
# NOTE(review): no random seed is set, so the split differs on every run.
n = int(len(en_ids)*0.9)
idxs = np.random.permutation(len(en_ids))
# Index with the already-split permutation so each padded array is gathered
# once per subset instead of being fully shuffled twice and then sliced.
train_idxs, test_idxs = idxs[:n], idxs[n:]
fr_train, fr_test = fr_padded[train_idxs], fr_padded[test_idxs]
en_train, en_test = en_padded[train_idxs], en_padded[test_idxs]

In [53]:
# Write the raw target-side sentence of every held-out pair, one per line and
# in the same order as the rows of the test split.
fr_test_words = []
with open('../pre_proc_wmt_advanced_attention_sos_eos_unk_validation_data.txt', 'w', encoding='utf8') as file:
    for idx in idxs[n:]:
        target_sentence = questions[idx][1]
        file.write(target_sentence + '\n')

In [54]:
en_train[0]

array([    1,    98,  5474,  5040,    98,   199,  1652,    10,   289,
           8,     4,   579,     7, 27618, 28531,     6,     2,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,     0], dtype=int32)

In [49]:
#en_train = en_train[0:100000]
#fr_train = fr_train[0:100000]

#en_test = en_test[0:10000]
#fr_test = fr_test[0:10000]

In [55]:
# Invert the word -> id table so the first training target row can be
# printed as "word id" pairs.
reverse_word_index = {token_id: word for word, token_id in fr_w2id.items()}
for token_id in fr_train[0]:
    print(reverse_word_index[token_id], token_id)

<SOS> 1
diese 48
erkenntnis 6048
beeinflusste 30363
seinen 251
weiteren 490
und 7
die 5
geschichte 519
von 10
ebm 38355
papst 8058
. 4
<EOS> 2
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0
<PAD> 0


## Saving the data

In [58]:
# needed to save  
look_ups = {'en_w2id':en_w2id,'fr_vocab':fr_vocab,'en_vocab':en_vocab, 'en_embs':en_embs,'fr_embs':fr_embs}
dump(look_ups, dpath+'look_upswmtsmall_sos_eos_unk_att.pkl')

In [59]:
data={'fr_train':fr_train,'en_train':en_train,'fr_test':fr_test,'en_test':en_test,}
dump(data, dpath+'nmt_datawmtsmall_sos_eos_unk_att.pkl')