## Language Translator

In [1]:
import nltk

From `nltk` we can download translated sentences between different languages. You can see the example between **English and French** below but feel free to try different combination as well.

In [2]:
nltk.download('comtrans')

[nltk_data] Downloading package comtrans to
[nltk_data]     C:\Users\Tim\AppData\Roaming\nltk_data...
[nltk_data]   Package comtrans is already up-to-date!


True

In [3]:
from nltk.corpus import comtrans
print(comtrans.aligned_sents('alignment-en-fr.txt')[0])

<AlignedSent: 'Resumption of the se...' -> 'Reprise de la sessio...'>


In [4]:
len(comtrans.aligned_sents('alignment-en-fr.txt'))

33334

In [5]:
english = [line.words for line in comtrans.aligned_sents('alignment-en-fr.txt')]
french = [line.mots for line in comtrans.aligned_sents('alignment-en-fr.txt')]

data = []
for i in range(len(english)):
    eng = ' '.join(english[i])
    fre = ' '.join(french[i])
    data.append([eng,fre])

In [6]:
data[0:5]

[['Resumption of the session', 'Reprise de la session'],
 ['I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999 , and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period .',
  'Je déclare reprise la session du Parlement européen qui avait été interrompue le vendredi 17 décembre dernier et je vous renouvelle tous mes vux en espérant que vous avez passé de bonnes vacances .'],
 ['You have requested a debate on this subject in the course of the next few days , during this part-session .',
  'Vous avez souhaité un débat à ce sujet dans les prochains jours , au cours de cette période de session .'],
 ["Please rise , then , for this minute ' s silence .",
  'Je vous invite à vous lever pour cette minute de silence .'],
 ["( The House rose and observed a minute ' s silence )",
  '( Le Parlement , debout , observe une minute de silence )']]

In [101]:
import pandas as pd
import numpy as np
from numpy.random import shuffle
import copy
import string
import re

from unicodedata import normalize

from keras.models import Model, Sequential
from keras.layers import Input, LSTM, GRU, Dense, Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import matplotlib.pyplot as plt
from keras.callbacks import ModelCheckpoint

In [8]:
np.array(data).shape

(33334, 2)

In [9]:
#prep
#sample
#tokenize + pad
#embed
#model

In [10]:
def clean(file):
    normalized_documents = []
    table = str.maketrans('', '', string.punctuation)
    
    for pair in file:
        cleaned = list()
        for line in pair:
            line = normalize('NFD', line).encode('ascii', 'ignore')
            line = line.decode('UTF-8')
            line = line.split()
            line = [i.lower() for i in line]
            line = [i.translate(table) for i in line]
            line = [i for i in line if i.isalpha()]
#             # Remove extra space characters
#             line = re.sub(r'\s+', ' ', line)
#             # Remove distracting characters
#             line = re.sub(r'''[\*\~]+''', "", line)
            cleaned.append(' '.join(line))

        normalized_documents.append(cleaned)
        
    normalized_documents = np.array(normalized_documents)
    return normalized_documents

In [11]:
cleaned_data = clean(data)

data = copy.deepcopy(cleaned_data)
shuffle(data)

In [12]:
data

array([['the resolution makes clear there is a need for a proportional system that gives form to women s representation and that all political parties must take action',
        'la resolution dit clairement que le systeme proportionnel qui permet la representation des femmes est necessaire et que les partis politiques doivent agir en ce sens'],
       ['i hope that turkey begins to play a constructive role in the cyprus issue',
        'j espere que la turquie commencera a jouer un role constructif dans la question chypriote'],
       ['should this be condemned ecologically frau breyer',
        'madame breyer ce qui se passe dans ce pays estil critiquable sur le plan ecologique'],
       ...,
       ['proposal for a resolution by mr dupuis and others on behalf of the tdi group on the un commission on human rights',
        'proposition de resolution deposee par les deputes dupuis et autres au nom du groupe tdi sur la commission des droits de l homme des nations unies'],
       ['many

In [13]:
input_ = []
output_ = []
output_input = []

count = 0
max_num = 10000 #change to sample

for i in data:
    count += 1
    if count > max_num:
        break
    input_sentence = i[0]
    output = i[1]
    output_sentence = output + ' <eos>'
    output_sentence_input = '<sos> ' + output
    
    input_.append(input_sentence)
    output_.append(output_sentence)
    output_input.append(output_sentence_input)

In [14]:
print("num samples input:", len(input_))
print("num samples output:", len(output_))
print("num samples output input:", len(output_input))

num samples input: 10000
num samples output: 10000
num samples output input: 10000


In [222]:
# tokenizing
def tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer
def tokenizer2(lines2):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines2)
    return tokenizer
def tokenizer3(lines3):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines3)
    return tokenizer
def tokenizer4(lines4):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines4)
    return tokenizer


# def max_length(lines):
#     return max(len(line.split()) for line in lines)

In [223]:
input_tokenizer = tokenizer(input_)
# input_is = input_tokenizer.texts_to_sequences(input_) # is = integer seq
input_index = input_tokenizer.word_index
num_words_input = len(input_index) + 1



output_tokenizer = tokenizer2(output_ + output_input)
# output_is = output_tokenizer.texts_to_sequences(output_)
# output_input_is = output_tokenizer.texts_to_sequences(output_input)
output_index = output_tokenizer.word_index
num_words_output = len(output_index) + 1
output_tokenizer2 = tokenizer3(output_)
output_tokenizer3 = tokenizer4(output_input)

In [207]:
# padding

# encode_input = pad_sequences(input_is, maxlen = max_input_length)
# print("encoder_input_sequences.shape:", encoder_input.shape)

# decoder_input = pad_sequences(output_input_is, maxlen = max_output_length, padding = 'post')
# print("decoder_input_sequences.shape:", decoder_input.shape)

In [208]:
# word embedding

def padding(tokenizer, length, lines, padding):
    sequences = tokenizer.texts_to_sequences(lines)
    sequences = pad_sequences(sequences,maxlen=length,padding=padding)
    return sequences

def encode_output(seq, sen_length):
    ylist = list()
    for i in seq:
        encoded = to_categorical(seq, num_classes=sen_length)
        ylist.append(encoded)
    y = array(ylist)
    y = y.reshape(seq.shape[0], seq.shape[1], sen_length)
    return y

In [209]:
max_input_length = max(len(sen) for sen in input_tokenizer.texts_to_sequences(input_))
max_output_length = max(len(sen) for sen in output_tokenizer.texts_to_sequences(output_))

encoder_input = padding(input_tokenizer, max_input_length, input_, 'pre')
decoder_input = padding(output_tokenizer, max_output_length, output_, 'post')
y = padding(output_tokenizer, max_output_length, output_, 'post')
# decoder_output = encode_output(y, len(output_index)+1)

In [None]:
max_input_length

In [22]:
decoder_targets_one_hot = np.zeros((
        len(input_),
        max_output_length,
        num_words_output
    ),
    dtype='float32'
)
decoder_targets_one_hot.shape

(10000, 40, 12869)

In [23]:
for i,d in enumerate(y):
    for t, word in enumerate(d):
        decoder_targets_one_hot[i,t,word]=1

In [24]:
# inputs -->
# model - encoder
# embedding
# lstm
# output_inputs -->
# model - decoder
# embedding
# lstm
# outputs -->
# model - dense
# compile
# --> predictions

In [25]:
num_words = min(max_num, len(input_index) + 1)

In [116]:
# model from walkthrough

# encoder
#embed layer
encoder_inp_plc = Input(shape=(max_input_length,))
embedding_layer = Embedding(num_words_input, 100, input_length=max_input_length)
x = embedding_layer(encoder_inp_plc)
#lstm layer
encoder = LSTM(256,return_state=True)

encoder_outputs, h, c = encoder(x)
encoder_states = [h,c]

#decoder
#embed layer
decoder_inp_plc = Input(shape=(max_output_length,))
decoder_embedding = Embedding(num_words_output, 256)
decoder_inp_x = decoder_embedding(decoder_inp_plc)
#lstm layer
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inp_x, initial_state=encoder_states)

#dense layer - predict decoder outputs
decoder_dense = TimeDistributed(Dense(num_words_output, activation = 'softmax'))
decoder_outputs = decoder_dense(decoder_outputs)

#model compilation
model = Model([encoder_inp_plc, decoder_inp_plc], decoder_outputs)
model.compile(
    optimizer = 'adam',
    loss = 'categorical_crossentropy',
    metrics = ['accuracy']
)

In [110]:
from keras.utils import plot_model
plot_model(model, show_shapes=True, show_layer_names=True)

('Failed to import pydot. You must `pip install pydot` and install graphviz (https://graphviz.gitlab.io/download/), ', 'for `pydotprint` to work.')


In [34]:
r = model.fit(
    [encoder_input, decoder_input],
    decoder_targets_one_hot,
    batch_size=64,
    epochs=10,
    validation_split=0.2,
    verbose = 2
)

Epoch 1/10
125/125 - 280s - loss: 4.4361 - accuracy: 0.4816 - val_loss: 3.5080 - val_accuracy: 0.4926
Epoch 2/10
125/125 - 279s - loss: 3.2472 - accuracy: 0.5215 - val_loss: 3.0072 - val_accuracy: 0.5390
Epoch 3/10
125/125 - 275s - loss: 2.7408 - accuracy: 0.5699 - val_loss: 2.5288 - val_accuracy: 0.6195
Epoch 4/10
125/125 - 293s - loss: 2.1814 - accuracy: 0.6913 - val_loss: 1.9746 - val_accuracy: 0.7402
Epoch 5/10
125/125 - 298s - loss: 1.7183 - accuracy: 0.7693 - val_loss: 1.6264 - val_accuracy: 0.7874
Epoch 6/10
125/125 - 293s - loss: 1.4179 - accuracy: 0.8100 - val_loss: 1.3838 - val_accuracy: 0.8311
Epoch 7/10
125/125 - 284s - loss: 1.1894 - accuracy: 0.8484 - val_loss: 1.1896 - val_accuracy: 0.8633
Epoch 8/10
125/125 - 281s - loss: 1.0058 - accuracy: 0.8765 - val_loss: 1.0415 - val_accuracy: 0.8862
Epoch 9/10
125/125 - 295s - loss: 0.8623 - accuracy: 0.8971 - val_loss: 0.9262 - val_accuracy: 0.9030
Epoch 10/10
125/125 - 301s - loss: 0.7488 - accuracy: 0.9120 - val_loss: 0.8325 - 

In [117]:
print(model.summary())

Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_10 (InputLayer)           [(None, 39)]         0                                            
__________________________________________________________________________________________________
input_11 (InputLayer)           [(None, 40)]         0                                            
__________________________________________________________________________________________________
embedding_7 (Embedding)         (None, 39, 100)      1008600     input_10[0][0]                   
__________________________________________________________________________________________________
embedding_8 (Embedding)         (None, 40, 256)      3294464     input_11[0][0]                   
____________________________________________________________________________________________

In [35]:
encoder_model = Model(encoder_inp_plc,encoder_states)
decoder_h = Input(shape=(256,))
decoder_c = Input(shape=(256,))
decoder_states_inputs = [decoder_h, decoder_c]

decoder_inputs_single = Input(shape=(1,))
decoder_x = decoder_embedding(decoder_inputs_single)

decoder_outputs, h, c = decoder_lstm(decoder_x, initial_state= decoder_states_inputs)
decoder_states = [h,c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model(
    [decoder_inputs_single] + decoder_states_inputs,
    [decoder_outputs] + decoder_states
)

In [36]:
from keras.utils import plot_model
plot_model(decoder_model, show_shapes=True, show_layer_names=True)

('Failed to import pydot. You must `pip install pydot` and install graphviz (https://graphviz.gitlab.io/download/), ', 'for `pydotprint` to work.')


In [37]:
index2word_inp = {v:k for k, v in input_index.items()}
index2word_out = {v:k for k,v in output_index.items()}

In [61]:
def translate(input_sq):
    value = encoder_model.predict(input_sq)
    target_sq = np.zeros((1,1))
    target_sq[0,0] = output_index['sos']
    eos = output_index['eos']
    output_sentence = []
    
    for _ in range(max_output_length):
        output_tokens, h, c = decoder_model.predict([target_sq]+value)
        idx = np.argmax(output_tokens[0,0,:])
        
        if eos == idx:
            break
        word = ''
        
        if idx > 0:
            word = index2word_out[idx]
            output_sentence.append(word)
        
        target_sq[0,0]=idx
        value = [h,c]
    return ' '.join(output_sentence)

In [65]:
i = np.random.choice(len(input_))
input_sq = encoder_input[i:i+1]
translation = translate(input_sq)
print('Input: ', input_[i])
print('Response: ', translation)

Input:  madam president for the past six months to a year the committee on development and cooperation has been calling on the commission to do something about the drought in ethiopia
Response:  permettezmoi permettezmoi permettezmoi permettezmoi permettezmoi permettezmoi permettezmoi permettezmoi permettezmoi permettezmoi permettezmoi permettezmoi permettezmoi permettezmoi permettezmoi permettezmoi permettezmoi permettezmoi monsieur monsieur monsieur monsieur monsieur monsieur monsieur moi monsieur moi monsieur moi monsieur quelles monsieur eu egalement eu egalement conseil egalement conseil


In [91]:
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

m = input_[10]
m

'some rights will apply to anyone present in the eu territory while others will benefit only citizens of the european union'

In [92]:
n = output_input[10]
n

'<sos> certains s appliqueront a toute personne presente sur le territoire de l union d autres auront pour seuls beneficiaires les citoyens de l union'

In [93]:
test = [m]
temp = padding(input_tokenizer, max_input_length, test, 'pre')
temp[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   89,   80,   22,  562,
          2,  751,  179,    5,    1,  105, 1192,  537,  382,   22,  752,
         73,  189,    3,    1,   26,   48])

In [94]:
test2 = [n]
temp2 = padding(output_tokenizer, max_output_length, test2, 'post')
temp2[0]

array([   5,  151,   37, 7111,    3,  131,  511,  292,   28,    7,  984,
          1,    6,   51,   12,   82, 1226,   22, 1388, 2121,    8,  160,
          1,    6,   51,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0])

In [95]:
res = model.predict([temp[0].reshape((1, temp[0].shape[0])),temp2[0].reshape((1, temp2[0].shape[0]))], verbose=0)[0]

In [158]:
integers = [np.argmax(vector) for vector in res]
#integers

In [97]:
target = list()
for i in integers:
    word = word_for_id(i, output_tokenizer)
    eos = output_index['eos']
    if word is None:
        break
    if eos == integers:
        break
    target.append(word)
translated = ' '.join(target)
print(translated)

applaudissements certains s signaler a toute personne presente sur le territoire de l union d autres auront pour seuls reelles les citoyens de l union eos eos eos eos eos eos eos eos eos eos


In [98]:
# model 

def define_model(inp_vocab, out_vocab, inp_timesteps, out_timesteps, n_units):
    model = Sequential()
    model.add(Embedding(inp_vocab, n_units, input_length=inp_timesteps, mask_zero=True))
    model.add(LSTM(n_units))
    model.add(RepeatVector(out_timesteps))
    model.add(LSTM(n_units, return_sequences=True))
    model.add(TimeDistributed(Dense(out_vocab, activation='softmax')))
    return model

In [99]:
# define model
model = define_model(num_words_input, num_words_output, max_input_length, max_output_length, 256)
model.compile(
    optimizer='adam', 
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# summarize defined model
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 39, 256)           2582016   
_________________________________________________________________
lstm_6 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector (RepeatVector) (None, 40, 256)           0         
_________________________________________________________________
lstm_7 (LSTM)                (None, 40, 256)           525312    
_________________________________________________________________
time_distributed_3 (TimeDist (None, 40, 12869)         3307333   
Total params: 6,939,973
Trainable params: 6,939,973
Non-trainable params: 0
_________________________________________________________________
None


In [102]:
checkpoint = ModelCheckpoint('e2f.h5', monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit([encoder_input, decoder_input],
          decoder_targets_one_hot, 
          epochs=15, 
          batch_size=64, 
          validation_split = 0.2, 
          callbacks=[checkpoint], 
          verbose=2)

Epoch 1/15
125/125 - 252s - loss: 4.6230 - accuracy: 0.4804 - val_loss: 4.0521 - val_accuracy: 0.4854

Epoch 00001: val_loss improved from inf to 4.05212, saving model to e2f.h5
Epoch 2/15
125/125 - 262s - loss: 3.9253 - accuracy: 0.4846 - val_loss: 3.8101 - val_accuracy: 0.4852

Epoch 00002: val_loss improved from 4.05212 to 3.81005, saving model to e2f.h5
Epoch 3/15
125/125 - 302s - loss: 3.7063 - accuracy: 0.4828 - val_loss: 3.6953 - val_accuracy: 0.4828

Epoch 00003: val_loss improved from 3.81005 to 3.69534, saving model to e2f.h5
Epoch 4/15
125/125 - 263s - loss: 3.6004 - accuracy: 0.4872 - val_loss: 3.6305 - val_accuracy: 0.4930

Epoch 00004: val_loss improved from 3.69534 to 3.63049, saving model to e2f.h5
Epoch 5/15
125/125 - 263s - loss: 3.5288 - accuracy: 0.4958 - val_loss: 3.6161 - val_accuracy: 0.4986

Epoch 00005: val_loss improved from 3.63049 to 3.61612, saving model to e2f.h5
Epoch 6/15
125/125 - 244s - loss: 3.4810 - accuracy: 0.5007 - val_loss: 3.6143 - val_accuracy:

<tensorflow.python.keras.callbacks.History at 0x2db153d8100>

In [103]:
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

In [177]:
m = input_[10]
m

'it is caught between the devil and the deep blue sea'

In [229]:
test = [m]
temp = padding(input_tokenizer, max_input_length, test, 'pre')
temp[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,   14,    6, 3673,  126,    1,
       5926,    4,    1, 1651, 4478, 1432])

In [238]:
test2 = [m]
temp2 = padding(output_tokenizer, max_input_length, test, 'post')
temp2[0]

array([4904, 6354, 2938, 2938,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0])

In [239]:
res = model.predict(temp[0].reshape((1, temp[0].shape[0])), verbose=0)[0]

In [240]:
integers = [np.argmax(vector) for vector in res]
#integers

In [241]:
target = list()
for i in integers:
    word = word_for_id(i, output_tokenizer)
    if word is None:
        break
    target.append(word)
translated = ' '.join(target)
print(translated)

circonstances circonstances circonstances circonstances circonstances circonstances circonstances franchement franchement franchement franchement franchement franchement franchement franchement franchement franchement franchement franchement franchement franchement franchement franchement franchement franchement franchement franchement franchement franchement franchement franchement franchement franchement franchement franchement franchement franchement franchement franchement franchement
