In [1]:
from keras.models import Model
from keras.layers import Input,Dense,LSTM,GRU,Embedding,Bidirectional,RepeatVector,Concatenate,Activation,Dot,Lambda
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import keras.backend as k

Using TensorFlow backend.


In [2]:
import numpy as np
import matplotlib.pyplot as plt

In [4]:

def softmax_over_time(x):
    """Softmax normalised along the time axis (axis=1) instead of the last axis.

    x is expected to have shape N x T x D, where
      N = number of samples,
      T = sequence length,
      D = vector dimensionality.
    The result has the same shape and sums to 1 along axis 1.
    """
    assert k.ndim(x) > 2
    # subtract the per-sequence maximum before exponentiating so exp() cannot overflow
    shifted = x - k.max(x, axis=1, keepdims=True)
    exps = k.exp(shifted)
    return exps / k.sum(exps, axis=1, keepdims=True)

In [5]:
# Hyperparameters and dataset limits used throughout the notebook.
BATCH_SIZE=64  # mini-batch size passed to model.fit
EPOCHS=1  # number of training epochs passed to model.fit
LATENT_DIM=256  # units per direction of the bidirectional encoder LSTM
LATENT_DIM_DECODER=256  # units of the decoder LSTM; also the width of the s0/c0 initial states
NUM_SAMPLES=10000  # upper bound on how many corpus lines are read
MAX_SEQUENCE_LENGTH=100  # not referenced in any of the visible cells
MAX_NUM_WORDS=20000  # vocabulary cap given to both Tokenizers and to the embedding matrix
EMBEDDING_DIM=100  # embedding width; has to match the 100-d GloVe vectors copied into the matrix

In [6]:
# Parallel lists, filled with one entry per sentence pair by the parsing loop:
input_texts=[]  # source (English) sentences
target_texts=[]  # target (Spanish) sentences with ' <eos>' appended -> decoder targets
target_input_texts=[]  # target (Spanish) sentences with '<sos> ' prepended -> decoder inputs


In [7]:
# Read the whole tab-separated corpus into memory and split it into lines.
# The context manager closes the file handle as soon as the text is read
# (the bare open(...).read() left it open). errors='ignore' silently drops
# bytes that are not valid UTF-8.
with open(r'F:\mukulml\NLP\spa-eng\spa.txt',encoding='utf-8',errors='ignore') as corpus_file:
    text=corpus_file.read().split('\n')

In [8]:
text[:1]

['Go.\tVe.\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #4986655 (cueyayotl)']

In [9]:
# Build the three parallel corpora from the first NUM_SAMPLES lines.
# Each line looks like "english<TAB>spanish<TAB>attribution"; only the first
# two fields are used.
for line in text[:NUM_SAMPLES]:
    fields=line.split('\t')
    if len(fields)<2:
        # blank or malformed line (e.g. the empty string that split('\n')
        # leaves after the final newline): skip it instead of failing to unpack
        continue
    eng,spa=fields[:2]
    target_text=spa+' <eos>'            # what the decoder must predict
    target_input_text='<sos> ' + spa    # what the decoder is fed (shifted by one token)
    input_texts.append(eng)
    target_texts.append(target_text)
    target_input_texts.append(target_input_text)

In [10]:
input_texts[:2]

['Go.', 'Go.']

In [11]:
target_texts[:2]

['Ve. <eos>', 'Vete. <eos>']

In [12]:
target_input_texts[:2]

['<sos> Ve.', '<sos> Vete.']

In [13]:
#tokenizing inputs
# Fit a tokenizer on the source sentences (vocabulary capped at MAX_NUM_WORDS)
# and convert every sentence into a list of integer word ids.
tokenizer=Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(input_texts)
input_sequences=tokenizer.texts_to_sequences(input_texts)


In [14]:
input_sequences[:10]

[[15], [15], [15], [15], [302], [167], [167], [167], [167], [167]]

In [15]:
#word 2 index
# word -> integer id mapping learned by the input tokenizer
word2idx_inputs=tokenizer.word_index
# length of the longest tokenized source sentence; used as the encoder sequence length
max_len_input=max(len(s) for s in input_sequences)
max_len_input


5

In [16]:
#tokenizing the output
# One tokenizer is fitted on both target variants so that '<sos>' and '<eos>'
# share a single vocabulary with the target words.
# filters='' overrides the Tokenizer's default character filter -- presumably so
# the '<' and '>' of the special tokens are not stripped; confirm against the
# Keras Tokenizer documentation.
tokenizer_o=Tokenizer(num_words=MAX_NUM_WORDS,filters='')
tokenizer_o.fit_on_texts(target_texts+target_input_texts)
target_sequences=tokenizer_o.texts_to_sequences(target_texts)
target_input_sequences=tokenizer_o.texts_to_sequences(target_input_texts)


In [17]:
target_sequences[:2]

[[1468, 1], [1004, 1]]

In [18]:
target_input_sequences[:2]

[[2, 1468], [2, 1004]]

In [19]:
# word -> integer id mapping learned by the target tokenizer
word2idx_output=tokenizer_o.word_index
# length of the longest tokenized target sentence; used as the decoder sequence length
max_len_output=max(len(s) for s in target_sequences)
# +1 leaves room for id 0, which the padded sequences use as filler
num_words_output=len(word2idx_output)+1

In [20]:
# Pad every source id sequence at the end ('post') up to max_len_input.
encoder_inputs=pad_sequences(input_sequences,maxlen=max_len_input,padding='post')


In [21]:
# Pad every '<sos> ...' id sequence at the end ('post') up to max_len_output.
decoder_inputs=pad_sequences(target_input_sequences,maxlen=max_len_output,padding='post')

In [22]:
decoder_inputs[:4]

array([[   2, 1468,    0,    0,    0,    0,    0,    0,    0],
       [   2, 1004,    0,    0,    0,    0,    0,    0,    0],
       [   2,  749,    0,    0,    0,    0,    0,    0,    0],
       [   2, 1005,    0,    0,    0,    0,    0,    0,    0]])

In [23]:
 decoder_inputs[0].shape

(9,)

In [24]:
# Pad every '... <eos>' id sequence at the end ('post') up to max_len_output.
decoder_targets=pad_sequences(target_sequences,maxlen=max_len_output,padding='post')

In [25]:
decoder_targets[:1]

array([[1468,    1,    0,    0,    0,    0,    0,    0,    0]])

In [26]:
decoder_targets[0].shape

(9,)

In [27]:
print('Loading word vectors...')
word2vec = {}
# The GloVe file is plain text, one entry per line, space-separated:
#   word vec[0] vec[1] vec[2] ...
with open(r'F:\mukulml\NLP\spa-eng\glove.6B.100d.txt',encoding='utf-8') as glove_file:
    for row in glove_file:
        fields = row.split()
        # first field is the word, the remainder are its vector components
        word2vec[fields[0]] = np.asarray(fields[1:], dtype='float32')
print('Found %s word vectors.' % len(word2vec))

Loading word vectors...
Found 400000 word vectors.


In [28]:
#prepare embedding matrix
# Row i of the matrix holds the pretrained vector of the word whose tokenizer
# id is i. Row 0 (the padding id) and words missing from word2vec stay all-zero.
num_words=min(MAX_NUM_WORDS,len(word2idx_inputs)+1)
embedding_matrix=np.zeros((num_words,EMBEDDING_DIM))
for word,i in word2idx_inputs.items():
    if i>=num_words:
        # id lies outside the capped vocabulary: there is no row for it
        continue
    # The lookup and the assignment both live under the range guard. Previously
    # the assignment ran for every word, so an out-of-range id reused the vector
    # of the previous word and indexed past the end of the matrix.
    embedding_vector=word2vec.get(word)
    if embedding_vector is not None:
        embedding_matrix[i]=embedding_vector
        

In [29]:
#creating embedding layer
# Encoder-side embedding initialised from embedding_matrix; it is declared for
# id sequences of length max_len_input.
embedding_layer=Embedding(num_words,EMBEDDING_DIM,weights=[embedding_matrix],input_length=max_len_input)

In [30]:
#creating one hot targets
# Dense float32 array of shape (number of samples, max_len_output, num_words_output),
# all zeros at this point; the following cell sets the one-hot entries.
decoder_one_hot_targets=np.zeros((len(input_texts),max_len_output,num_words_output),dtype='float32')

In [31]:
# Set a 1 at the vocabulary index of every real target word.
# Id 0 is only the padding filler, so padded time steps are left all-zero: an
# all-zero target row contributes nothing to categorical cross-entropy, which
# keeps the model from being trained to predict padding.
# NOTE(review): the stock 'accuracy' metric presumably still scores the padded
# steps -- confirm before reading much into the reported accuracy.
for i,d in enumerate(decoder_targets):
    for t,word in enumerate(d):
        if word>0:
            decoder_one_hot_targets[i,t,word]=1

In [32]:
 ###building model
#setup encoder simple
# Encoder: source ids -> embeddings -> bidirectional LSTM.
# return_sequences=True keeps one output vector per input position; those
# per-position outputs are what the attention step consumes.
encoder_input_placeholder=Input(shape=(max_len_input,))
x=embedding_layer(encoder_input_placeholder)
encoder=Bidirectional(LSTM(LATENT_DIM,return_sequences=True,dropout=0.3))
encoder_outputs=encoder(x)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [33]:
# Model input for the decoder id sequences (the '<sos> ...' variant), one id per target time step.
decoder_inputs_placeholder=Input(shape=(max_len_output,))


In [34]:
# Decoder-side embedding over the target vocabulary; no pretrained weights are passed in.
decoder_embedding=Embedding(num_words_output,EMBEDDING_DIM)
# embedded decoder inputs, sliced one time step at a time in the decoding loop
decoder_inputs_x=decoder_embedding(decoder_inputs_placeholder)

In [35]:
#attention layer
# These layers are created once and reused by one_step_attention at every
# decoder time step, so the same attention weights are applied at each step.
attn_repeat_layer=RepeatVector(max_len_input)  # copies s(t-1) once per encoder position
attn_concat_layer=Concatenate(axis=-1)  # joins each h(t') with s(t-1) along the feature axis
attn_dense1=Dense(10,activation='tanh')  # hidden layer of the scoring network
attn_dense2=Dense(1,activation=softmax_over_time)  # one weight per position, normalised over time
attn_dot=Dot(axes=1)#to perform the weighted sum of alpha(t)*h(t)

In [36]:
def one_step_attention(h,st_1):
    """Compute the attention context for a single decoder time step.

    h    -- encoder outputs h(1)..h(Tx); per sample shape (Tx, LATENT_DIM*2)
    st_1 -- previous decoder state s(t-1); per sample shape (LATENT_DIM_DECODER,)

    Returns the weighted sum over encoder positions of alpha(t') * h(t').
    """
    # tile s(t-1) Tx times and pair it with every h(t'):
    # per sample shape becomes (Tx, LATENT_DIM*2 + LATENT_DIM_DECODER)
    paired=attn_concat_layer([h,attn_repeat_layer(st_1)])

    # two-layer scoring network; the second layer applies the softmax over time
    alphas=attn_dense2(attn_dense1(paired))

    # dot over the time axis: sum over t' of alphas[t'] * h[t']
    return attn_dot([alphas,h])

In [37]:
#define rest of the decoder(after attention)
# return_state=True makes the LSTM return (output, hidden state, cell state);
# the decoding loop unpacks all three at every step.
decoder_lstm=LSTM(LATENT_DIM_DECODER,return_state=True)
# maps the LSTM output to a softmax distribution over the target vocabulary
decoder_dense=Dense(num_words_output,activation='softmax')

In [38]:
# Explicit model inputs for the decoder's initial hidden state (s0) and cell state (c0).
initial_s=Input(shape=(LATENT_DIM_DECODER,),name='s0')
initial_c=Input(shape=(LATENT_DIM_DECODER,),name='c0')
# joins the attention context with the current input embedding along axis 2 (features)
context_last_word_concat_layer=Concatenate(axis=2)

In [39]:
# Unroll the decoder for max_len_output steps. At each step the attention
# context is computed from the previous state, concatenated with that step's
# input embedding and fed through the shared LSTM and Dense layers.
# s and c are reassigned after each step.
s=initial_s
c=initial_c
outputs=[]

#collect output in a list at first
for t in range(max_len_output): #Ty times
    #get the context using the attention mechanism
    context=one_step_attention(encoder_outputs,s)

    #we need a different layer for each time step
    # t is bound as a default argument so each Lambda keeps its own step index.
    # A plain closure looks `t` up when the function is called, so any later
    # re-invocation of these layers (e.g. after reloading or cloning the model)
    # would make every layer slice the final time step.
    selector=Lambda(lambda x,t=t: x[:,t:t+1])
    xt=selector(decoder_inputs_x)

    #combine
    decoder_lstm_input=context_last_word_concat_layer([context,xt])

    #pass the combined [context,last word] into the lstm
    #along with [s,c]
    #get the new [s,c] and the output
    o,s,c=decoder_lstm(decoder_lstm_input,initial_state=[s,c])

    #final dense layer to get the next word prediction
    decoder_outputs=decoder_dense(o)
    outputs.append(decoder_outputs)

In [40]:
def stack_and_transpose(x):
    """Turn the per-step decoder outputs into one batch-major tensor.

    x is a list of length T (= Ty); each element is a
    batch_size x output_vocab_size tensor. Stacking the list gives
    T x N x D; the axes are then permuted to N x T x D, i.e.
    batch_size x T x output_vocab_size.
    """
    return k.permute_dimensions(k.stack(x),pattern=(1,0,2))


In [41]:
#make it a layer
# Wrapping the function in a Lambda layer makes the stacking part of the model graph.
# Note that `outputs` is rebound here: before this line it is the list of
# per-step tensors, afterwards it is the single stacked tensor.
stacker=Lambda(stack_and_transpose)
outputs=stacker(outputs)

In [44]:
# Four inputs: source ids, decoder input ids, and the decoder's initial
# hidden (s0) and cell (c0) states; one output: the stacked per-step predictions.
model=Model(inputs=[encoder_input_placeholder,decoder_inputs_placeholder,initial_s,initial_c],outputs=outputs)

In [45]:
#compile the model
# categorical cross-entropy pairs with the one-hot encoded decoder targets
model.compile(optimizer='rmsprop',loss='categorical_crossentropy',metrics=['accuracy'])

In [49]:
#train the model
# All-zero initial hidden/cell state for the decoder, one row per example.
# The row count comes from the data actually loaded rather than NUM_SAMPLES,
# so the array still lines up with the other inputs when the corpus yields
# fewer sentence pairs than NUM_SAMPLES.
z=np.zeros((len(encoder_inputs),LATENT_DIM_DECODER)) #initial s c
# the last 20% of the examples are held out for validation; r keeps the training history
r=model.fit([encoder_inputs,decoder_inputs,z,z],decoder_one_hot_targets,batch_size=BATCH_SIZE,epochs=EPOCHS,validation_split=0.2)


Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 8000 samples, validate on 2000 samples
Epoch 1/1
