# Preprocessing for NMT Model

This example was taken from the wonderful Cutting Edge Deep Learning for Coders course as taught by Jeremy Howard http://course.fast.ai/part2.html The course is now live and I encourage you to check it out.


In [6]:
%matplotlib inline
import importlib
#import sutils; importlib.reload(sutils)
from sutils import *

import os
import keras
import gensim
import re
import pickle
import collections
import keras.backend as K

from keras_tqdm import TQDMNotebookCallback
from keras import initializers
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential
from keras.layers import *
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, Callback, ReduceLROnPlateau, LearningRateScheduler, EarlyStopping, TensorBoard
from keras.callbacks import LambdaCallback


from recurrentshop import *
import seq2seq
from seq2seq.models import AttentionSeq2Seq,SimpleSeq2Seq, Seq2Seq

import tensorflow as tf

We will use **gensim** to load pretrained **word2vec** vectors for the German side; the English side uses GloVe vectors, which are loaded further below.

In [7]:
from gensim.models import word2vec

In [8]:
#limit_gpu_mem()

In [10]:
# Working directories: `path` is the project root, `dpath` is where the
# preprocessed pickles are written.
path = '/data/TensorFlowTalks/neural_translation_en_de/'
dpath = '/data/TensorFlowTalks/neural_translation_en_de/translate/'
# exist_ok=True keeps the cell safe to re-run and avoids the check-then-create
# race of `if not os.path.exists(...)`.
os.makedirs(path, exist_ok=True)
os.makedirs(dpath, exist_ok=True)

## Preparing the Corpus

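We will build a parallel corpus of English sentences and their German translations from the Tatoeba training file. (The regex filter that would restrict the corpus to questions is left commented out below, so all sentence pairs are kept.)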

In [17]:
# Split the tab-separated training file into two single-language files
# (one sentence per line, aligned by line number).

with open('/data/wrapper/PA_BA/DataSets/Training/DE_EN_(tatoeba)_train.txt', encoding='UTF-8') as file:
    # Drop empty lines (e.g. the one produced by a trailing newline) so the
    # two-way unpack below does not fail on them.
    lines = [line for line in file.read().split('\n') if line]
print(len(lines))
if lines:
    print(lines[0])
lines_de = []
lines_en = []
for line in lines:
    # Each record is "<english>\t<german>"; any other number of fields raises
    # ValueError instead of being silently mis-split.
    input_text, target_text = line.split('\t')
    lines_en.append(input_text)
    lines_de.append(target_text)
assert len(lines_de) == len(lines_en)

# Sentences are newline-separated with no trailing newline, so reading the
# files back with .split('\n') yields exactly one entry per sentence.
with open('/data/wrapper/PA_BA/DataSets/Training/DE_EN_(tatoeba)_train_german_only.txt', 'w', encoding='utf8') as file:
    file.write("\n".join(lines_de))
with open('/data/wrapper/PA_BA/DataSets/Training/DE_EN_(tatoeba)_train_english_only.txt', 'w', encoding='utf8') as file:
    file.write("\n".join(lines_en))


106975
Write Tom.	Schreibt Tom!


In [32]:
# Common prefix of the two single-language files.
fname = "/data/wrapper/PA_BA/DataSets/Training/" + 'DE_EN_(tatoeba)_train_'
# NOTE: `fr_fname` points at the German file despite its `fr_` prefix.
en_fname, fr_fname = (fname + suffix for suffix in ('english_only.txt', 'german_only.txt'))

In [33]:
# this creates the Regex for filtering just for questions
#re_eq = re.compile('^(Wh[^?.!]+\?)')
#re_fq = re.compile('^([^?.!]+\?)')

In [34]:
#this runs our regex search on the full corpus and filters it down
#lines = ((re_eq.search(eq), re_fq.search(fq)) 
#         for eq, fq in zip(open(en_fname, encoding='utf8'), open(fr_fname, encoding='utf8')))

In [44]:
# Re-read the two aligned single-language files and pair them up line by line.
# `fr_fname` is the German file; the context manager closes both handles.
with open(en_fname, encoding='utf8') as en_file, open(fr_fname, encoding='utf8') as de_file:
    lines = list(zip(en_file.read().split('\n'), de_file.read().split('\n')))
# No question filter is applied, so `questions` is simply every sentence pair.
questions = lines
questions[0:5]

[('Write Tom.', 'Schreibt Tom!'),
 ('What do you like to eat?', 'Was essen Sie gerne?'),
 ('Tom grabbed Mary to keep her from falling.',
  'Tom ergriff Maria, auf dass sie nicht falle.'),
 ('You really seem to like beer.',
  'Du scheinst wirklich ein Bierfreund zu sein.'),
 ('Tom is feeding the cows.', 'Tom füttert die Kühe.')]

Now we want to put them all in a list so that we can easily access them

In [36]:
#questions = [(e.group(), f.group()) for e,f in lines if e and f]
#len(questions)

In [37]:
#questions[5:10]

Now let's save this so we can come back to it in the future.

In [38]:
# Persist the sentence pairs under `dpath`. `dump` is not defined in this
# notebook; presumably it is a pickling helper pulled in by
# `from sutils import *` — confirm.
dump(questions, dpath+'questions.pkl')

Loading and unwrapping the raw English/German sentence pairs

In [39]:
# Reload the saved pairs. `load` is not defined in this notebook; presumably it
# is the counterpart of `dump` from sutils — confirm.
questions = load(dpath+'questions.pkl')
# Unzip the pairs into two parallel tuples; `fr_qs` holds the second element of
# each pair (the German sentence in this notebook).
en_qs, fr_qs = zip(*questions)

Next we need to split the questions into tokens so that we can make sequences for the model

In [40]:
# One space followed by zero or more spaces, i.e. any run of consecutive spaces.
re_mult_space = re.compile(r"  *")
# A word character plus an apostrophe (typographic ’ or ASCII '), followed by
# another word character; the two groups let a space be inserted after the
# apostrophe.
re_mw_punc = re.compile(r"(\w[’'])(\w)")
# A single punctuation character from the set " ( ) . , ; : / _ ? ! —, captured
# so it can be surrounded with spaces.
re_punc = re.compile("([\"().,;:/_?!—])")
# A word character directly followed by 's at a word boundary; the captured
# character is kept so 's can be split off.
re_apos = re.compile(r"(\w)'s\b")

In [41]:
def simple_toks(sent):
    """Split a sentence into lower-cased tokens.

    Steps, in order: detach a trailing ``'s`` from its word, insert a space
    after a word-internal apostrophe, surround punctuation with spaces,
    turn hyphens into spaces, collapse runs of spaces, then lower-case and
    split on whitespace.

    Returns the list of tokens.
    """
    spaced = re_apos.sub(r"\1 's", sent)
    spaced = re_mw_punc.sub(r"\1 \2", spaced)
    spaced = re_punc.sub(r" \1 ", spaced)
    spaced = spaced.replace('-', ' ')
    collapsed = re_mult_space.sub(' ', spaced)
    return collapsed.lower().split()

In [46]:
# Tokenize every sentence in `fr_qs` and preview the first four results.
fr_qtoks = [simple_toks(sentence) for sentence in fr_qs]
fr_qtoks[:4]

[['schreibt', 'tom', '!'],
 ['was', 'essen', 'sie', 'gerne', '?'],
 ['tom', 'ergriff', 'maria', ',', 'auf', 'dass', 'sie', 'nicht', 'falle', '.'],
 ['du', 'scheinst', 'wirklich', 'ein', 'bierfreund', 'zu', 'sein', '.']]

In [47]:
# Tokenize every sentence in `en_qs` and preview the first four results.
en_qtoks = [simple_toks(sentence) for sentence in en_qs]
en_qtoks[:4]

[['write', 'tom', '.'],
 ['what', 'do', 'you', 'like', 'to', 'eat', '?'],
 ['tom', 'grabbed', 'mary', 'to', 'keep', 'her', 'from', 'falling', '.'],
 ['you', 'really', 'seem', 'to', 'like', 'beer', '.']]

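Now we need to convert tokens to ids so that we can create lookup tables.

We also insert the "<PAD>" token here, at index 0.

This function returns:
- ids - every sentence as a list of integer word ids
- vocab - the list of words, most frequent first, with "<PAD>" at position 0 (id -> word)
- w2id - the dictionary for looking up the id of a word (word -> id)
- voc_cnt - the vocab count, i.e. how often each word occurs in the corpus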

In [48]:
def toks2ids(sents):
    """Map tokenized sentences to integer ids and build the lookup tables.

    Words are ordered by descending corpus frequency (ties keep first-seen
    order) and "<PAD>" is placed in front of them at index 0.

    Args:
        sents: iterable of token lists.

    Returns:
        A 4-tuple ``(ids, vocab, w2id, voc_cnt)``: the sentences as lists of
        ids, the id -> word list, the word -> id dict, and the
        ``collections.Counter`` of token frequencies.
    """
    voc_cnt = collections.Counter()
    for sent in sents:
        for tok in sent:
            voc_cnt[tok] += 1

    # most_common() sorts by count, descending, with a stable sort.
    vocab = ["<PAD>"] + [word for word, _ in voc_cnt.most_common()]
    w2id = dict(zip(vocab, range(len(vocab))))

    ids = []
    for sent in sents:
        ids.append([w2id[tok] for tok in sent])
    return ids, vocab, w2id, voc_cnt

In [49]:
# Build id sequences and lookup tables for each side of the corpus. The `fr_`
# names hold the target-side (German) data in this notebook.
fr_ids, fr_vocab, fr_w2id, fr_counts = toks2ids(fr_qtoks)
en_ids, en_vocab, en_w2id, en_counts = toks2ids(en_qtoks)
# Vocabulary sizes; each includes the "<PAD>" entry inserted by toks2ids.
len(en_vocab), len(fr_vocab)

(13187, 26925)

#### Sentences converted to vectors

In [50]:
print(en_ids[1])
print(en_qtoks[1])

[26, 18, 4, 39, 3, 170, 8]
['what', 'do', 'you', 'like', 'to', 'eat', '?']


#### The look up tables / dictionaries 

In [51]:
en_vocab[18]

'do'

In [53]:
en_w2id['do']

18

## Word Embeddings

here we are going to make look up tables for words to embeddings

The GloVE embeddings used here are 400k words with 100 dimensions

In [56]:
def old_load_glove(loc):
    """Load pre-converted GloVe data stored under the path prefix ``loc``.

    Reads ``loc + '.txt'`` through ``load_array`` and unpickles
    ``loc + '_words.pkl'`` and ``loc + '_idx.pkl'``.

    Returns:
        A tuple ``(vectors, words, idx)`` in that order.
    """
    # NOTE(review): `load_array` is not defined in this notebook — presumably
    # it comes from `from sutils import *`; confirm.
    vectors = load_array(loc+'.txt')
    # NOTE(security): pickle.load can execute arbitrary code; only use this on
    # trusted files.
    # Context managers close the pickle files, which the bare `open(...)`
    # calls leaked.
    with open(loc+'_words.pkl','rb') as words_file:
        words = pickle.load(words_file, encoding='latin1')
    with open(loc+'_idx.pkl','rb') as idx_file:
        idx = pickle.load(idx_file, encoding='latin1')
    return (vectors, words, idx)
def load_glove(loc):
    """Parse a GloVe text file into an embedding matrix and lookup tables.

    Args:
        loc: path of the GloVe file without its ``.txt`` extension. Every
            non-empty line must have the form ``<word> <v1> <v2> ... <vN>``,
            separated by single spaces.

    Returns:
        A tuple ``(vecs, words, idx)``: a float32 array with one row per
        word, the list of words in file order, and a dict mapping each word
        to its row index.

    Raises:
        ValueError: if a vector component cannot be parsed as a float.
    """
    en_wv_word = []
    en_wv_idx = {}
    en_vecs = []
    # GloVe text files are UTF-8; decoding them as latin1 garbles every
    # non-ASCII word so it can never match a corpus token.
    with open(loc + '.txt', encoding='utf-8') as file:
        # Stream line by line instead of holding the whole file in memory.
        for line in file:
            line = line.rstrip('\n')
            if not line:
                continue  # tolerate blank lines, e.g. at the end of the file
            fields = line.split(' ')
            word = fields[0]
            en_wv_idx[word] = len(en_wv_word)
            en_wv_word.append(word)
            # Parse the components right away: the result is a numeric matrix
            # rather than an array of strings carrying a trailing "\n".
            en_vecs.append(np.asarray(fields[1:], dtype=np.float32))
    return (np.asarray(en_vecs), en_wv_word, en_wv_idx)

In [57]:
en_vecs, en_wv_word, en_wv_idx = load_glove('/data/TensorFlowTalks/embeddings/glove/6B.100d')

In [58]:
# Word -> GloVe vector lookup, built from the word -> row-index table.
en_w2v = {word: en_vecs[row] for word, row in en_wv_idx.items()}

In [67]:
# Number of GloVe entries and their dimensionality, read off the loaded matrix.
n_en_vec, dim_en_vec = en_vecs.shape
# Hard-coded dimensionality for the target-side vectors; presumably it must
# equal the vector size of the word2vec model loaded further below — TODO
# confirm against the model.
dim_fr_vec = 300

In [68]:
print("dim_en_vec", dim_en_vec)
print("n_en_vec", n_en_vec)

dim_en_vec 100
n_en_vec 400000


In [69]:
#fr_wik = pickle.load(open('/data/TensorFlowTalks/embeddings/french/polyglot-fr.pkl', 'rb'), 
#                     encoding='latin1')

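The French embeddings were trained by Jean-Philippe Fauconnier. Note that the cell below loads German word2vec vectors (`word_emb_de.bin`), so this attribution appears to be carried over from the French version of this example and may not describe the file actually used here.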

- Word vectors: http://fauconnier.github.io/index.html#wordembeddingmodels
- frWac: http://wacky.sslmit.unibo.it/doku.php?id=corpora

In [70]:
# Binary word2vec file for the target language. The path points at German
# vectors although the variables below keep the `fr_` prefix.
w2v_path='/data/TensorFlowTalks/embeddings/german/word_emb_de.bin'

# Load the vectors as gensim KeyedVectors.
fr_model = gensim.models.KeyedVectors.load_word2vec_format(w2v_path, binary=True)
# NOTE(review): `KeyedVectors.vocab` was removed in gensim 4.0 in favour of
# `key_to_index` — confirm the installed gensim version. `fr_voc` is not used
# again in the visible part of this notebook.
fr_voc = fr_model.vocab

In [71]:
def create_emb(w2v, targ_vocab, dim_vec):
    """Build an embedding matrix with one row per word of ``targ_vocab``.

    Args:
        w2v: mapping-like object; ``w2v[word]`` returns the vector for a
            known word and raises ``KeyError`` for an unknown one.
        targ_vocab: sequence of words; row ``i`` of the result belongs to
            ``targ_vocab[i]``.
        dim_vec: length of each embedding vector.

    Returns:
        Array of shape ``(len(targ_vocab), dim_vec)``. Words missing from
        ``w2v`` get a row drawn from ``normal(scale=0.6, size=(dim_vec,))``.
    """
    # NOTE(review): `normal` is not defined in this notebook — presumably
    # numpy.random.normal via `from sutils import *`; confirm.
    emb = np.zeros((len(targ_vocab), dim_vec))

    for row, word in enumerate(targ_vocab):
        try:
            vector = w2v[word]
        except KeyError:
            # Unknown word: fall back to a random initialisation.
            vector = normal(scale=0.6, size=(dim_vec,))
        emb[row] = vector

    return emb

In [72]:
# Embedding matrix for the English vocabulary; its shape is the cell output.
en_embs = create_emb(en_w2v, en_vocab, dim_en_vec)
en_embs.shape

(13187, 100)

In [73]:
# Embedding matrix for the `fr_vocab` vocabulary, looked up in the word2vec
# model; its shape is the cell output.
fr_embs = create_emb(fr_model, fr_vocab, dim_fr_vec)
fr_embs.shape

(26925, 300)

## Data checks

In [74]:
en_lengths = collections.Counter(len(s) for s in en_ids)

### Keras pad_sequences 

In [75]:
# Fixed sequence length passed to pad_sequences below: every sentence is
# padded or truncated to this many token ids.
maxlen = 30

In [76]:
# Pad and truncate at the end ("post") so every English id sequence has
# length `maxlen`.
en_padded = pad_sequences(en_ids, maxlen=maxlen, padding="post", truncating="post")

In [77]:
# Pad and truncate at the end ("post") so every target-side id sequence has
# length `maxlen`.
fr_padded = pad_sequences(fr_ids, maxlen=maxlen, padding="post", truncating="post")

In [78]:
en_padded.shape, fr_padded.shape, en_embs.shape

((106975, 30), (106975, 30), (13187, 100))

In [79]:
# 90/10 train/test split over a random permutation of the sentence indices.
n = int(len(en_ids)*0.9)
# A locally seeded generator makes the split reproducible across kernel
# restarts without touching numpy's global random state.
idxs = np.random.RandomState(42).permutation(len(en_ids))
# Index with the two halves of the permutation directly; the original built a
# full shuffled copy of each padded array twice.
train_idxs, test_idxs = idxs[:n], idxs[n:]
fr_train, fr_test = fr_padded[train_idxs], fr_padded[test_idxs]
en_train, en_test = en_padded[train_idxs], en_padded[test_idxs]

In [80]:
en_train[0]

array([   4, 3758,   34,   28,  356,  112,  108,    1,    4,  152,   35,
        103, 2817,    1,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0], dtype=int32)

## Saving the data

In [81]:
# needed to save  
look_ups = {'en_w2id':en_w2id,'fr_vocab':fr_vocab,'en_vocab':en_vocab, 'en_embs':en_embs,'fr_embs':fr_embs}
dump(look_ups, dpath+'look_ups.pkl')

In [82]:
# Padded id arrays for both sides of the corpus, split into train and test.
data = dict(
    fr_train=fr_train,
    en_train=en_train,
    fr_test=fr_test,
    en_test=en_test,
)
dump(data, dpath+'nmt_data.pkl')