In [1]:
import numpy as np
import keras.backend as K
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences 
from keras.layers import Dense, Input, LSTM, Bidirectional, Embedding, RepeatVector, Concatenate, Dot, Lambda

Using TensorFlow backend.


In [2]:
# Configuration: training hyper-parameters, model sizes and data paths.
BATCH_SIZE = 32  # mini-batch size passed to model.fit
EPOCH = 100  # number of epochs passed to model.fit
LATENT_DIM = 256 # units of the encoder LSTM (per direction of the Bidirectional wrapper)
NUM_SAMPLES = 10000 # maximum number of lines read from the translation file
MAX_NUM_WORDS = 20000  # vocabulary cap given to both Tokenizers and to the embedding matrix
MAX_SEQUENCE_LENGTH = 100  # NOTE(review): not referenced in the visible cells - confirm it is still needed
EMBEDDING_SIZE = 100  # embedding width; assumed to match the 100-d vectors in `path` - TODO confirm
LATENT_DIM_DECODER = 256  # units of the decoder LSTM and width of its initial states s0/c0
path = 'data/glove.6B.100d.txt'  # word-vector text file parsed by process_embedding_file

In [3]:
# processing dataset
# Every usable line of the corpus is "<source>\t<translation>". From it we
# build the encoder input text plus the two teacher-forcing variants of the
# target: one prefixed with <sos> (decoder input) and one suffixed with
# <eos> (decoder target).
input_texts = []
target_texts_output = []
target_texts_input = []

# `with` guarantees the file handle is closed, including on the early `break`
# and if a malformed line raises while being unpacked.
with open('data/eng_to_hindi.txt', encoding='utf-8') as corpus:
    # t counts every line read (skipped ones included), so at most
    # NUM_SAMPLES lines of the file are consumed.
    for t, line in enumerate(corpus, start=1):
        if t > NUM_SAMPLES:
            break
        if '\t' not in line:
            # not a source/translation pair
            continue
        input_text, translation = line.split('\t')
        cleaned_translation = translation.strip()
        target_text_output = cleaned_translation + ' <eos>'
        target_text_input = '<sos> ' + cleaned_translation

        input_texts.append(input_text)
        target_texts_output.append(target_text_output)
        target_texts_input.append(target_text_input)
print("number of samples : {}".format(len(input_texts)))

number of samples : 2869


In [4]:
# tokenizing sentences
# --- source language -------------------------------------------------------
tokenizer_inputs = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer_inputs.fit_on_texts(input_texts)
input_sequences = tokenizer_inputs.texts_to_sequences(input_texts)
word2idx_inputs = tokenizer_inputs.word_index
print('Unique tokkens in inputs : {}'.format(len(word2idx_inputs)))
# longest tokenised source sentence; used as the encoder sequence length
max_len_input = max(map(len, input_sequences))

# --- target language -------------------------------------------------------
# filters='' turns off the tokenizer's character filtering - presumably so the
# '<sos>' / '<eos>' markers are kept as tokens (verify against Keras defaults).
tokenizer_outputs = Tokenizer(num_words=MAX_NUM_WORDS, filters='')
# Fit on both variants so the vocabulary contains <sos> as well as <eos>.
tokenizer_outputs.fit_on_texts(target_texts_output + target_texts_input)
target_sequences_input = tokenizer_outputs.texts_to_sequences(target_texts_input)
target_sequences_output = tokenizer_outputs.texts_to_sequences(target_texts_output)
word2idx_outputs = tokenizer_outputs.word_index
print('Unique tokkens in outputs : {}'.format(len(word2idx_outputs)))
# longest tokenised target sentence; used as the decoder sequence length
max_len_target = max(len(seq) for seq in target_sequences_input)
# +1 leaves room for index 0, which the padded sequences use as filler
num_words_output = 1 + len(word2idx_outputs)

Unique tokkens in inputs : 2402
Unique tokkens in outputs : 3161


In [5]:
# pad sequences
def _show_padded(shape_label, first_row_label, padded):
    """Print the shape and the first row of a padded sequence array."""
    print(shape_label, padded.shape)
    print(first_row_label, padded[0])


# Source sequences use pad_sequences' default padding side.
encoder_inputs = pad_sequences(input_sequences, maxlen=max_len_input)
_show_padded("encoder shape :", "encoder_data[0] s:", encoder_inputs)

# Both decoder arrays are padded at the end ('post') to max_len_target.
decoder_inputs = pad_sequences(
    target_sequences_input, maxlen=max_len_target, padding='post')
_show_padded("decoder input shape :", "decoder_input_data[0] s:", decoder_inputs)

decoder_targets = pad_sequences(
    target_sequences_output, maxlen=max_len_target, padding='post')
_show_padded("decoder output shape :", "decoder_output_data[0] s:", decoder_targets)

encoder shape : (2869, 22)
encoder_data[0] s: [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 90]
decoder input shape : (2869, 26)
decoder_input_data[0] s: [   2 1500    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0]
decoder output shape : (2869, 26)
decoder_output_data[0] s: [1500    1    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0]


In [6]:
def process_embedding_file(path):
    """Read a whitespace-separated word-vector text file.

    Each non-blank line is expected to be ``<word> <v1> <v2> ...``.

    Parameters
    ----------
    path : str
        Location of the UTF-8 encoded vector file.

    Returns
    -------
    dict
        Maps each word (first field of a line) to a float32 numpy array
        built from the remaining fields. If a word occurs more than once,
        the last occurrence wins.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a vector component cannot be converted to float32.
    """
    word2vec = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank / whitespace-only line (e.g. a trailing newline at
                # EOF): there is no word to index, so skip it instead of
                # failing with IndexError.
                continue
            word2vec[fields[0]] = np.asarray(fields[1:], dtype='float32')
    return word2vec
            
# Parse the pre-trained vector file configured in `path` and report its size.
word2vec = process_embedding_file(path)
num_pretrained_vectors = len(word2vec)
print('length of word embeddings : {}'.format(num_pretrained_vectors))

length of word embeddings : 400000


In [7]:
# create weight matrix for words in training sentences
# Row i receives the pre-trained vector of the source word whose tokenizer
# index is i. Words without a pre-trained vector keep an all-zero row.
num_words = min(MAX_NUM_WORDS, len(word2idx_inputs) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_SIZE))
for token, row in word2idx_inputs.items():
    if row >= num_words:
        # index falls outside the capped vocabulary
        continue
    pretrained = word2vec.get(token)
    if pretrained is None:
        continue
    embedding_matrix[row] = pretrained
# The full vector dictionary is no longer needed once the matrix is filled.
del word2vec

In [8]:
# Encoder embedding layer: num_words rows of EMBEDDING_SIZE dimensions,
# initialised from embedding_matrix and declared for inputs of length
# max_len_input.
# NOTE(review): `trainable` is not passed, so the Keras default applies -
# confirm whether the pre-trained vectors are meant to be fine-tuned.
embedding_layer =Embedding(num_words,
                           EMBEDDING_SIZE,
                           weights=[embedding_matrix],
                           input_length=max_len_input)

In [9]:
# One-hot encode the decoder targets: shape
# (num_samples, max_len_target, num_words_output).
decoder_targets_one_hot = np.zeros((len(input_sequences),
                                    max_len_target,
                                    num_words_output),
                                   dtype='float32')

# Index 0 is the filler written by pad_sequences, not a real word. Padded
# time steps are therefore left as all-zero rows: with a cross-entropy of the
# form -sum(y_true * log(y_pred)) such rows add nothing to the loss, so the
# model is no longer trained to emit "padding" as if it were a vocabulary item.
# NOTE(review): the built-in 'accuracy' metric may still score those steps -
# confirm if a masked metric is wanted.
sample_idx, step_idx = np.nonzero(decoder_targets)
decoder_targets_one_hot[sample_idx,
                        step_idx,
                        decoder_targets[sample_idx, step_idx]] = 1

In [10]:
# build model
# Encoder: embed the padded source tokens, then run a bidirectional LSTM.
# return_sequences=True keeps the output of every source time step so the
# attention mechanism can weight all of them; each direction has LATENT_DIM
# units, i.e. 2*LATENT_DIM features per step. dropout=.5 is applied inside
# the LSTM.
encoder_inputs_placeholder = Input(shape=(max_len_input,))
x = embedding_layer(encoder_inputs_placeholder)
encoder = Bidirectional(LSTM(LATENT_DIM,return_sequences=True,dropout=.5))
encoder_outputs = encoder(x)

In [11]:
# Decoder input branch: the teacher-forcing target sequence (length
# max_len_target) is passed through its own embedding layer with
# num_words_output rows of EMBEDDING_SIZE dimensions. No initial weights are
# supplied here, unlike the encoder embedding.
decoder_inputs_placeholder = Input(shape=(max_len_target,))
decoder_embedding = Embedding(num_words_output,EMBEDDING_SIZE)
decoder_inputs_x = decoder_embedding(decoder_inputs_placeholder)

In [12]:
# attention layer
def softmax_over_time(x):
    """Softmax taken along axis 1 (the time axis) instead of the last axis.

    `x` must have more than two dimensions. The per-sample maximum along
    axis 1 is subtracted before exponentiating, which leaves the result
    unchanged mathematically while keeping the exponentials bounded.
    """
    assert K.ndim(x) > 2
    shifted = x - K.max(x, axis=1, keepdims=True)
    numerator = K.exp(shifted)
    denominator = K.sum(numerator, axis=1, keepdims=True)
    return numerator / denominator

# Attention layers, created once at module level and reused by every call to
# one_step_attention.
attn_repeat_layer = RepeatVector(max_len_input)  # copies the decoder state once per encoder time step
attn_concatenate_layer = Concatenate(axis=-1)  # joins encoder outputs and repeated state along the feature axis
attn_dense1 = Dense(10,activation='tanh')  # hidden layer of the scoring network (10 units)
attn_dense2 = Dense(1,activation=softmax_over_time)  # one score per time step, normalised over the time axis
attn_dot = Dot(axes=1)  # contracts weights and encoder outputs over axis 1 (time)

def one_step_attention(h, st_1):
    """Compute the attention context for a single decoder step.

    h    -- encoder outputs for all Tx source steps, (Tx, 2*LATENT_DIM) per sample
    st_1 -- previous decoder hidden state, (LATENT_DIM_DECODER,) per sample

    Returns the output of attn_dot: the encoder outputs combined over the
    time axis using the attention weights.
    """
    # (Tx, LATENT_DIM_DECODER): the same state repeated for every source step
    repeated_state = attn_repeat_layer(st_1)
    # (Tx, 2*LATENT_DIM + LATENT_DIM_DECODER)
    scorer_input = attn_concatenate_layer([h, repeated_state])
    scorer_hidden = attn_dense1(scorer_input)
    # (Tx, 1): weights normalised across the time axis
    alphas = attn_dense2(scorer_hidden)
    return attn_dot([alphas, h])
    

In [16]:
# Decoder layers are created once and shared by every unrolled time step.
decoder_lstm = LSTM(LATENT_DIM_DECODER,return_state=True)
decoder_dense = Dense(num_words_output, activation='softmax')
# Initial hidden / cell state of the decoder are model inputs.
initial_s = Input(shape=(LATENT_DIM_DECODER,),name='s0')
initial_c = Input(shape=(LATENT_DIM_DECODER,),name='c0')
context_last_word_concat_layer = Concatenate(axis=2)

s = initial_s
c = initial_c

# Unroll the decoder manually: attention needs the previous state s at every
# step, so the LSTM is applied to one time step at a time.
outputs = []
for t in range(max_len_target):
    context = one_step_attention(encoder_outputs, s)
    # we need a different layer for each time step
    # `t=t` freezes the current index inside the lambda. A plain closure would
    # look `t` up at call time, so any later re-invocation of the layer
    # (e.g. after reloading the model) would slice with whatever value the
    # global `t` holds by then.
    selector = Lambda(lambda x, t=t: x[:, t:t+1])
    xt = selector(decoder_inputs_x)

    # context and the teacher-forced word embedding are joined on the feature axis
    decoder_lstm_input = context_last_word_concat_layer([context, xt])
    o, s, c = decoder_lstm(decoder_lstm_input, initial_state=[s, c])
    decoder_outputs = decoder_dense(o)
    outputs.append(decoder_outputs)
    
# `outputs` is now a list of length Ty; each element has shape
# (batch_size, output_vocab_size). Stacking the list yields one tensor of
# shape (T, N, D), whereas the model output should be (N, T, D).

def stack_and_transpose(x):
    """Stack a list of (N, D) tensors and return the result as (N, T, D)."""
    return K.permute_dimensions(K.stack(x), pattern=(1, 0, 2))

# Merge the per-step predictions into a single (N, T, D) output tensor.
stacker = Lambda(stack_and_transpose)
outputs = stacker(outputs)

# Training model: source tokens, teacher-forced target tokens and the
# decoder's initial hidden / cell state go in; per-step word distributions
# come out.
model = Model(inputs=[encoder_inputs_placeholder,
                      decoder_inputs_placeholder,
                      initial_s,
                      initial_c],
             outputs=outputs)
model.compile(optimizer='rmsprop',loss='categorical_crossentropy',metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# would add a stray "None" line after the table.
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 22)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 22, 100)      240300      input_1[0][0]                    
__________________________________________________________________________________________________
s0 (InputLayer)                 (None, 256)          0                                            
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 22, 512)      731136      embedding_1[0][0]                
____________________________________________________________________________________________

In [14]:
# train the model
# The decoder's initial hidden state (s0) and cell state (c0) are both fed
# the same all-zero array.
z = np.zeros((len(encoder_inputs), LATENT_DIM_DECODER))
train_inputs = [encoder_inputs, decoder_inputs, z, z]
history = model.fit(
    train_inputs,
    decoder_targets_one_hot,
    batch_size=BATCH_SIZE,
    epochs=EPOCH,
    validation_split=.2
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 2295 samples, validate on 574 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100

KeyboardInterrupt: 

In [None]:
# One figure for the loss curves, then one for the accuracy curves; each
# overlays the training series with its validation counterpart.
for train_key, val_key, train_label, val_label in (
        ('loss', 'val_loss', 'loss', 'val_loss'),
        ('accuracy', 'val_accuracy', 'acc', 'val_acc')):
    plt.plot(history.history[train_key], label=train_label)
    plt.plot(history.history[val_key], label=val_label)
    plt.legend()
    plt.show()

In [None]:
# make prediction
# Inference uses two models that reuse the layers built for training:
#   * encoder_model maps source tokens to the per-step encoder outputs;
#   * decoder_model performs ONE decoding step, so it can be called in a
#     loop that feeds each predicted word back in.

encoder_model = Model(encoder_inputs_placeholder,encoder_outputs)
# Stand-in input for the encoder outputs (max_len_input steps of 2*LATENT_DIM features).
encoder_outputs_as_input = Input(shape=(max_len_input,2*LATENT_DIM,))

# A single target token per call, embedded with the decoder embedding layer.
decoder_inputs_single = Input(shape=(1,))
decoder_inputs_single_x = decoder_embedding(decoder_inputs_single)

# Same step as in the training loop: attention context + current word -> LSTM -> softmax.
context = one_step_attention(encoder_outputs_as_input,initial_s)
decoder_lstm_input = context_last_word_concat_layer([context,decoder_inputs_single_x])
o, s, c = decoder_lstm(decoder_lstm_input, initial_state=[initial_s,initial_c])
decoder_outputs = decoder_dense(o)

# The updated states are returned as outputs so the caller can pass them to the next step.
decoder_model = Model(inputs = [decoder_inputs_single,encoder_outputs_as_input,initial_s,initial_c],
                    outputs=[decoder_outputs,s,c])

In [None]:
# Reverse lookups (token index -> word) for the source and target vocabularies.
idx2words_inputs = {index: token for token, index in word2idx_inputs.items()}
idx2words_outputs = {index: token for token, index in word2idx_outputs.items()}

In [None]:
def decode_sequence(input_seq):
    """Greedily translate one padded source sequence.

    The source is encoded once with encoder_model. Starting from '<sos>' and
    all-zero decoder states, decoder_model is then called one step at a
    time; the highest-scoring index of each step becomes the next decoder
    input. Decoding stops at '<eos>' or after max_len_target steps.

    Returns the predicted words joined by single spaces. Index 0 is fed back
    like any other prediction but contributes no word to the result.
    """
    enc_out = encoder_model.predict(input_seq)

    # (1, 1) float array holding the current decoder input token
    target_seq = np.array([[word2idx_outputs['<sos>']]], dtype=np.float64)
    eos_idx = word2idx_outputs['<eos>']

    state_h = np.zeros((1, LATENT_DIM_DECODER))
    state_c = np.zeros((1, LATENT_DIM_DECODER))

    predicted_words = []
    for _ in range(max_len_target):
        probs, state_h, state_c = decoder_model.predict(
            [target_seq, enc_out, state_h, state_c])

        next_idx = np.argmax(probs.flatten())
        if next_idx == eos_idx:
            break
        if next_idx > 0:
            predicted_words.append(idx2words_outputs[next_idx])
        # feed the prediction back in as the next input token
        target_seq[0, 0] = next_idx
    return ' '.join(predicted_words)

In [None]:
# Translate a sample sentence: tokenise and pad it the same way as the
# training inputs, then decode greedily.
input_sen = "how are you "
input_seq = pad_sequences(
    tokenizer_inputs.texts_to_sequences([input_sen]),
    maxlen=max_len_input)
translation = decode_sequence(input_seq)
print(translation)