In [1]:
import importlib
#import sutils; importlib.reload(sutils)
from sutils import *

import keras
import gensim
import re
import pickle
import keras.backend as K

from keras import initializers
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential
from keras.layers import *
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, Callback, ReduceLROnPlateau, LearningRateScheduler, EarlyStopping, TensorBoard
from keras.callbacks import LambdaCallback


from recurrentshop import *
import seq2seq
from seq2seq.models import AttentionSeq2Seq,SimpleSeq2Seq, Seq2Seq

import tensorflow as tf
from keras_tqdm import TQDMNotebookCallback

Using TensorFlow backend.


In [2]:
# Record the library versions this notebook was executed with.
for lib in (keras, tf):
    print(lib.__version__)

2.0.8
1.3.0


In [3]:
from gensim.models import word2vec

# Project root (relative to the notebook) and the sub-directory from which the
# pickled data is loaded and to which the trained weights are saved.
path = '../neural_translation_en_de/'
dpath = path + 'translate/'

In [4]:

# ### Set up Regex and tokenize for use later

# Pre-compiled patterns used by `simple_toks`.
re_mult_space = re.compile(r"  *")                 # a run of one or more spaces
re_mw_punc = re.compile(r"(\w[’'])(\w)")           # apostrophe inside a word
re_punc = re.compile("([\"().,;:/_?!—])")          # punctuation that becomes its own token
re_apos = re.compile(r"(\w)'s\b")                  # trailing 's after a word character


def simple_toks(sent):
    """Tokenize a sentence into a list of lower-cased tokens.

    Steps, in order: detach a trailing ``'s``, split words at an inner
    apostrophe (the apostrophe stays with the left part), surround
    punctuation with spaces, turn hyphens into spaces, collapse repeated
    spaces, then lower-case and split on whitespace.

    Args:
        sent: the raw sentence as a string.

    Returns:
        List of token strings (empty list for an empty sentence).
    """
    substitutions = (
        (re_apos, r"\1 's"),
        (re_mw_punc, r"\1 \2"),
        (re_punc, r" \1 "),
    )
    text = sent
    for pattern, replacement in substitutions:
        text = pattern.sub(replacement, text)
    text = text.replace('-', ' ')
    text = re_mult_space.sub(' ', text)
    return text.lower().split()

In [8]:

# ## Load the PreProcessed data
# 
# Here we load all the data 

# NOTE(review): `load` is not defined in this notebook; it presumably comes from
# the `sutils` star import and unpickles the given file. Only use it on trusted
# files if that is the case.
# NOTE(review): the `fr_` prefix looks like a leftover name; the paths and the
# sample translations further down suggest the target language is German.
data = load(dpath + 'nmt_datawmtsmall_sos_eos_unk.pkl')
look_ups = load(dpath + 'look_upswmtsmall_sos_eos_unk.pkl')

# Train / validation id arrays for the target (fr_*) and source (en_*) side.
fr_train, fr_test = data['fr_train'], data['fr_test']
en_train, en_test = data['en_train'], data['en_test']

# Source word -> id mapping, id -> word vocabularies and embedding matrices.
en_w2id = look_ups['en_w2id']
fr_vocab, en_vocab = look_ups['fr_vocab'], look_ups['en_vocab']
en_embs, fr_embs = look_ups['en_embs'], look_ups['fr_embs']

# Sequence of (source, target) pairs, split into two parallel tuples.
questions = load(dpath + 'questionswmt.pkl')
en_qs, fr_qs = zip(*questions)

In [9]:
# Show the shapes of the target and source training arrays.
for split in (fr_train, en_train):
    print(split.shape)

(100000, 100)
(100000, 100)


In [18]:
# Write the target side of the validation split as plain text, one sentence per
# line, with the special marker tokens removed. Every kept token is followed by
# a single space (so non-empty lines end with a space before the newline).
_markers = ["<SOS>", "<EOS>", "<PAD>"]
with open("en_de_trans_testing_basic_wmt_advsmall_sos_eos_unk_validation_data.txt", 'w', encoding='utf8') as file:
    for id_row in fr_test:
        words = (fr_vocab[idx] for idx in id_row)
        line = "".join(word + " " for word in words if word not in _markers)
        file.write(line + '\n')

In [42]:

# ## Model

# #### Create some Keras Callbacks to handle early stopping and Learning Rate changes

# In[10]:


# Halve the learning rate when the validation loss plateaus, never going below
# 1e-5. NOTE(review): patience=0 presumably triggers a reduction on the first
# epoch without improvement - confirm that this is intended.
reduce_LR = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=0,
    cooldown=1,
    min_lr=0.00001,
)

# Stop training once the validation loss has not improved for 4 epochs.
early_stopping = EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=4,
    verbose=0,
    mode='auto',
)

import math

# Step-decay learning rate schedule: the rate is multiplied by 0.9 every
# 3 epochs (first drop at epoch index 2, since the step count uses 1 + epoch).
def LRDropping(epoch):
    """Return the learning rate for the given zero-based epoch index.

    Args:
        epoch: epoch index.

    Returns:
        0.001 * 0.9 ** floor((1 + epoch) / 3) as a float.
    """
    base_rate, decay, epochs_per_drop = 0.001, 0.9, 3.0
    n_drops = math.floor((1 + epoch) / epochs_per_drop)
    return base_rate * math.pow(decay, n_drops)

# Hand-tuned learning rate schedule with fixed values per epoch range.
def fixed_dropping(epoch):
    """Return (and print) the manually chosen learning rate for an epoch.

    Epochs 0-1 use 0.01, epochs 2-3 use 0.001, epochs 4-6 use 0.0005 and
    everything from epoch 7 onwards uses 0.0001.

    Args:
        epoch: zero-based epoch index.

    Returns:
        The learning rate as a float.
    """
    # (exclusive upper epoch bound, learning rate), checked in order.
    schedule = ((2, 0.01), (4, 0.001), (7, 0.0005))
    lrate = 0.0001
    for upper_bound, rate in schedule:
        if epoch < upper_bound:
            lrate = rate
            break
    print(lrate)
    return lrate

# Keras callback wrapping the hand-tuned `fixed_dropping` schedule
# (used by the `params2` / `params3` callback sets).
LRDrop = LearningRateScheduler(fixed_dropping)


# In[11]:

# TensorBoard logging (including the graph) to a fixed absolute directory.
tbCallBack = TensorBoard(
    log_dir='/data/model_newGraphadvsmall_sos_eos_unk',
    write_graph=True,
)

# Save the weights after every epoch; epoch number and training loss are part
# of the file name.
modelCallback = ModelCheckpoint(
    'model_checkpoint_advsmall_sos_eos_unk.{epoch:03d}-{loss:.3f}.hdf5',
    monitor='val_loss',
    verbose=1,
    save_best_only=False,
    save_weights_only=True,
    mode='auto',
    period=1,
)

# Ready-made keyword sets that can be splatted into `model.fit(...)`.
# `params`  : plateau-based LR reduction + early stopping + logging/checkpoints.
# `params2` : fixed LR schedule + plateau reduction + early stopping.
# `params3` : currently identical to `params2`.
params = dict(
    verbose=1,
    callbacks=[TQDMNotebookCallback(), reduce_LR, early_stopping, tbCallBack, modelCallback],
)
params2 = dict(
    verbose=1,
    callbacks=[LRDrop, TQDMNotebookCallback(), reduce_LR, early_stopping],
)
params3 = dict(
    verbose=1,
    callbacks=[LRDrop, TQDMNotebookCallback(), reduce_LR, early_stopping],
)

In [43]:


# #### Set some parameters for the model

# In[12]:


lr = 1e-3            # learning rate set on the optimizer after compiling
maxlen = 100         # padded sentence length (input and output)
dim_en_vec = 200     # width of the source embedding vectors
n_en_vec = 400000    # NOTE(review): not referenced by any cell visible here
dim_fr_vec = 200     # width of the target embedding vectors

vocab_size = len(fr_vocab) #the output vocab # embeddings.shape[0]
embedding_size = 200 #The english inputs embeddings embeddings.shape[1]


# Initial weights for the output Dense layer: the transposed target embeddings
# as kernel and an all-zero bias with one entry per target word.
# The bias shape is now an explicit 1-tuple; the previous spelling
# `np.zeros((len(fr_vocab,)))` had the comma inside the `len(...)` call and
# only produced the right array because np.zeros also accepts a plain int.
fr_wgts = [fr_embs.T, np.zeros((vocab_size,))]

In [9]:
# ### The model itself

# Base Model big
# Vocabulary sizes are taken from the loaded data instead of being hard-coded,
# so the layers always match the embedding matrix and the `fr_wgts` initial
# weights they are built from.
n_en_words = en_embs.shape[0]   # rows of the source embedding matrix
n_fr_words = len(fr_vocab)      # one output unit per target word

# Source ids -> frozen pre-trained embeddings.
inp = Input((maxlen,))
x = Embedding(n_en_words, dim_en_vec, input_length=maxlen,
              weights=[en_embs], trainable=False)(inp)
# Two bidirectional LSTM layers followed by one unidirectional LSTM; all of
# them return the full sequence so there is one output per position.
x = Bidirectional(LSTM(128, return_sequences=True))(x)
x = Bidirectional(LSTM(128, return_sequences=True))(x)
x = LSTM(128, return_sequences=True)(x)
# Per-position projection into the target embedding space, then onto the
# target vocabulary (initialised from the target embeddings).
x = TimeDistributed(Dense(dim_fr_vec))(x)
x = TimeDistributed(Dense(n_fr_words, weights=fr_wgts))(x)
x = Activation('softmax')(x)

model = Model(inp, x)
model.compile('adam', 'sparse_categorical_crossentropy')


# Override the optimizer's learning rate with the configured value.
K.set_value(model.optimizer.lr, lr)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 200)          8000600   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 256)          336896    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 100, 256)          394240    
_________________________________________________________________
lstm_3 (LSTM)                (None, 100, 128)          197120    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 100, 200)          25800     
_________________________________________________________________
time_distributed_2 (TimeDist (None, 100, 40003)        8040603   
__________

In [10]:
# Train for up to 10 epochs with the `params` callback set. The target ids get
# a trailing axis of size 1 (presumably the shape the sparse categorical
# cross-entropy loss expects).
hist = model.fit(
    en_train,
    np.expand_dims(fr_train, axis=-1),
    batch_size=96,
    epochs=10,
    validation_data=[en_test, np.expand_dims(fr_test, axis=-1)],
    **params
)

Train on 100000 samples, validate on 10000 samples


Epoch 1/10


Epoch 2/10


Epoch 3/10


Epoch 4/10


Epoch 5/10


Epoch 6/10


Epoch 7/10


Epoch 8/10


Epoch 9/10


Epoch 10/10



In [12]:
#plot_train(hist)55000epoch2
# Print every recorded metric together with its per-epoch values.
for metric, per_epoch in hist.history.items():
    print(metric, per_epoch)


loss [1.9045504039001464, 1.6949071324157714, 1.650644474105835, 1.6116637854385376, 1.5739272228240966, 1.5392015573883058, 1.5086121353149413, 1.4816526740646363, 1.4569068114852906, 1.4339287570190429]
val_loss [1.7023146617889404, 1.651752395248413, 1.6190241916656494, 1.5844059438705445, 1.5559279487609863, 1.5338460102081299, 1.517339727783203, 1.5095806240081786, 1.4990445705413817, 1.4903958782196045]
lr [0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001]


In [13]:
# Persist the trained weights and immediately read them back from the same file.
weight_identifier = "trans_testing_basic_wmt_advsmall_sos_eos_unk"
_weights_file = dpath + weight_identifier + '.h5'
model.save_weights(_weights_file)
model.load_weights(_weights_file)

In [23]:
# ## Testing
# Switches controlling how `sent2ids` encodes a sentence.
SOS = True   # prepend the <SOS> id
EOS = True   # append the <EOS> id
UNK = True   # map out-of-vocabulary tokens to <UNK> (otherwise drop them)


def sent2ids(sent):
    """Encode a raw sentence as a padded array of source word ids.

    The sentence is tokenized with `simple_toks`, each token is looked up in
    `en_w2id`, and the result is padded / truncated at the end to `maxlen`.

    Args:
        sent: the raw source sentence.

    Returns:
        The result of `pad_sequences` for a batch containing this single
        sentence.
    """
    ids = [en_w2id["<SOS>"]] if SOS else []
    for token in simple_toks(sent):
        try:
            ids.append(en_w2id[token])
        except KeyError:
            # Unknown word: substitute <UNK> when enabled, otherwise skip it.
            if UNK:
                ids.append(en_w2id["<UNK>"])
    if EOS:
        ids.append(en_w2id["<EOS>"])
    return pad_sequences([ids], maxlen, padding="post", truncating="post")

def en2fr(sent):
    """Translate one source sentence with the trained model.

    For every output position the most probable target id is taken. Ids that
    are not greater than 0 and the <SOS> / <EOS> markers are left out of the
    result.

    Args:
        sent: the raw source sentence.

    Returns:
        The predicted target words joined by single spaces.
    """
    best_ids = np.argmax(model.predict(sent2ids(sent)), axis=-1)[0]
    words = []
    for idx in best_ids:
        if idx > 0:
            word = fr_vocab[idx]
            if word not in ["<SOS>", "<EOS>"]:
                words.append(word)
    return ' '.join(words)


In [24]:
en2fr("what is the size of canada?")

'wie ist die der ? ? ?'

In [25]:
en2fr("what is the size of australia?")

'was ist die der ? ? ?'

In [26]:
en2fr("What is light?")

'was ist ? ?'

In [27]:
en2fr("Why is the Arctic ozone layer thicker than the Antarctic ozone layer?")

'warum ist die schwarzmeerregion der der .'

In [28]:
en2fr("Which province is the most populated?")

'das ist ist ist ist ? ?'

In [29]:
en2fr("Who are we?")

'wer sind ? ?'

In [30]:
en2fr("What would we do without it?")

'was wollen wir nicht nicht ?'

In [33]:
en2fr("Hello Tom")

'nl .'

## Prediction

In [34]:
import os

source_file = "/data/wrapper/PA_BA/DataSets/Validation/DE_EN_(tatoeba)_validation_english_only.txt"
if not os.path.exists(source_file):
    # Raise instead of calling exit(): inside a notebook exit() acts on the
    # kernel rather than cleanly aborting this cell with a readable error.
    raise FileNotFoundError("source file does not exist: " + source_file)

# Read all sentences and close the file afterwards. split('\n') is kept on
# purpose: a trailing newline yields a final empty sentence, so the number of
# output lines stays the same as before.
with open(source_file, encoding='UTF-8') as src:
    source_sentences = src.read().split('\n')
print(len(source_sentences))

# Report progress roughly every 1%; at least every sentence so that inputs
# with fewer than 100 sentences no longer cause a modulo-by-zero error.
progress_step = max(1, len(source_sentences) // 100)

translated_sentences = []
for i, sent in enumerate(source_sentences):
    if i % progress_step == 0:
        print(i)
    translated_sentences.append(en2fr(sent))
print(len(translated_sentences))

30563
0
305
610
915
1220
1525
1830
2135
2440
2745
3050
3355
3660
3965
4270
4575
4880
5185
5490
5795
6100
6405
6710
7015
7320
7625
7930
8235
8540
8845
9150
9455
9760
10065
10370
10675
10980
11285
11590
11895
12200
12505
12810
13115
13420
13725
14030
14335
14640
14945
15250
15555
15860
16165
16470
16775
17080
17385
17690
17995
18300
18605
18910
19215
19520
19825
20130
20435
20740
21045
21350
21655
21960
22265
22570
22875
23180
23485
23790
24095
24400
24705
25010
25315
25620
25925
26230
26535
26840
27145
27450
27755
28060
28365
28670
28975
29280
29585
29890
30195
30500
30563


In [44]:
# Store the translations next to the source file, one sentence per line, in a
# file named after the weight identifier with a ".pred" extension.
source_dir = os.path.abspath(os.path.join(source_file, os.pardir))
out_file = os.path.join(source_dir, weight_identifier + ".pred")
with open(out_file, 'w', encoding='utf8') as file:
    file.writelines(sent + '\n' for sent in translated_sentences)

In [None]:
todo:
    python model_wmt_adv_notebook__sos_eos_unk_big.py
    python model_wmt_adv_notebook__sos_eos_unk_big_my_preproc.py   (still has to be created)