# Bahdanau Attention

In [None]:
# NOTE: this implementation may not run as-is because of TensorFlow/Keras version compatibility issues, but the concept is correct. For working code, see this blog post: https://blog.paperspace.com/seq-to-seq-attention-mechanism-keras/

import tensorflow as tf
import numpy as np

In [2]:
# Toy parallel corpus: source sentences and their target-language counterparts,
# aligned by position.
input_texts = ["hello there", "how are you", "i am fine"]

target_texts = ["hola allí", "cómo estás", "estoy bien"]

# The decoder needs two views of every target sentence: the input view is
# prefixed with a start symbol, the output view is suffixed with an end symbol.
target_texts_in = [f"<SOS> {sentence}" for sentence in target_texts]
target_texts_out = [f"{sentence} <EOS>" for sentence in target_texts]

In [3]:
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

def _fit_tokenizer(texts):
    """Fit a Tokenizer on ``texts`` and return ``(tokenizer, vocab_size)``.

    ``filters=''`` turns off the tokenizer's character filtering.
    ``vocab_size`` is the number of distinct words plus one.
    """
    tokenizer = Tokenizer(filters='')
    tokenizer.fit_on_texts(texts)
    return tokenizer, len(tokenizer.word_index) + 1


# Source-side vocabulary.
encoder_tokenizer, encoder_vocab = _fit_tokenizer(input_texts)

# Target-side vocabulary, fitted on both decoder views so that the start and
# end symbols are both part of it.
decoder_tokenizer, decoder_vocab = _fit_tokenizer(target_texts_in + target_texts_out)

In [4]:
# Convert every sentence into its list of integer word ids.
encoder_input_sequences = encoder_tokenizer.texts_to_sequences(input_texts)
decoder_input_sequences, decoder_output_sequences = (
    decoder_tokenizer.texts_to_sequences(texts)
    for texts in (target_texts_in, target_texts_out)
)

In [5]:
# Longest sequence on each side determines the padded length.
max_encoder_len = max(map(len, encoder_input_sequences))
max_decoder_len = max(map(len, decoder_input_sequences))

# Pad at the end ('post') so every row on a given side has the same length.
encoder_input_sequences = pad_sequences(
    encoder_input_sequences, maxlen=max_encoder_len, padding='post'
)
decoder_input_sequences = pad_sequences(
    decoder_input_sequences, maxlen=max_decoder_len, padding='post'
)
# The decoder targets are padded to the same length as the decoder inputs.
decoder_output_sequences = pad_sequences(
    decoder_output_sequences, maxlen=max_decoder_len, padding='post'
)

In [7]:
# Number of sentence pairs in the toy corpus.
num_samples = len(target_texts)
# Embedding width and recurrent/attention layer width used by the model below.
embedding_dim, units = 5, 128

In [9]:
# Bahdanau Attention

class BahdanauAttention(tf.keras.layers.Layer):
    """Additive (Bahdanau-style) attention.

    Scores every encoder position as ``V(tanh(W1(encoder) + W2(query)))``,
    normalises the scores with a softmax over axis 1 and returns the
    weighted sum of the encoder outputs.
    """

    def __init__(self, units):
        """Create the three projection layers; ``units`` is the width of W1/W2."""
        super().__init__()
        dense = tf.keras.layers.Dense
        self.W1 = dense(units)  # projects the encoder outputs
        self.W2 = dense(units)  # projects the decoder hidden state
        self.V = dense(1)       # collapses each position to a single score

    def call(self, hidden_state, encoder_ouputs):
        """Return ``(context_vector, attention_weights)``.

        Assumes ``hidden_state`` is (batch, units) and ``encoder_ouputs`` is
        (batch, time, units) -- TODO confirm against callers.
        """
        # Insert an axis at position 1 so the query broadcasts against every
        # encoder position in the addition below.
        query = tf.expand_dims(hidden_state, 1)

        projected = self.W1(encoder_ouputs) + self.W2(query)
        score = self.V(tf.nn.tanh(projected))

        # Normalise over axis 1, then collapse that axis with a weighted sum.
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = tf.reduce_sum(attention_weights * encoder_ouputs, axis=1)
        return context_vector, attention_weights
        
        

In [None]:
class DecoderWithAttention(tf.keras.layers.Layer):
    """One decoder step: embedding -> Bahdanau attention -> LSTM -> softmax.

    The layer is called once per target position and threads the LSTM
    hidden/cell state through successive calls.
    """

    def __init__(self, vocab_size, embedding_dim, units):
        """Build the sub-layers.

        Args:
            vocab_size: size of the embedding table and of the softmax output.
            embedding_dim: width of the token embeddings.
            units: width of the LSTM and of the attention projections.
        """
        super().__init__()
        self.units = units

        layers = tf.keras.layers
        self.embedding = layers.Embedding(vocab_size, embedding_dim)
        # return_state=True: the LSTM returns (output, h, c).
        self.lstm = layers.LSTM(units, return_state=True)
        self.attention = BahdanauAttention(units)
        self.dense = layers.Dense(vocab_size, activation="softmax")

    def call(self, dec_input, hidden_state, cell_state, encoder_outputs):
        """Run a single decoding step.

        Args:
            dec_input: token ids for the current step; assumed (batch, 1)
                -- TODO confirm against callers.
            hidden_state: previous LSTM hidden state, also the attention query.
            cell_state: previous LSTM cell state.
            encoder_outputs: encoder output sequence attended over.

        Returns:
            Tuple ``(output, h, c, attn_weights)``: the softmax output of the
            final Dense layer, the new LSTM hidden and cell states, and the
            attention weights.
        """
        token_embedding = self.embedding(dec_input)

        context, attn_weights = self.attention(hidden_state, encoder_outputs)

        # Add an axis at position 1 to the context so it can be concatenated
        # with the embedded token along the last axis.
        step_features = tf.concat(
            [tf.expand_dims(context, axis=1), token_embedding], axis=-1
        )

        lstm_out, h, c = self.lstm(
            step_features, initial_state=[hidden_state, cell_state]
        )

        return self.dense(lstm_out), h, c, attn_weights

In [None]:
# Encoder: token ids -> embeddings -> LSTM.
encoder_input = tf.keras.layers.Input(shape=(None,))
enc_emb = tf.keras.layers.Embedding(encoder_vocab, embedding_dim)(encoder_input)

# return_sequences=True exposes the per-step outputs (what the attention layer
# looks at); return_state=True additionally returns the final hidden and cell
# states.
encoder_lstm = tf.keras.layers.LSTM(
    units,
    return_sequences=True,
    return_state=True,
)
encoder_ouputs, *encoder_states = encoder_lstm(enc_emb)
state_h, state_c = encoder_states

In [None]:
# Decoder (training graph): the decoder layer is unrolled over a fixed number
# of steps, one call per target position, fed with the ground-truth tokens.
decoder_input = tf.keras.layers.Input(shape=(None,))

decoder_layer = DecoderWithAttention(decoder_vocab, embedding_dim, units)

all_outputs = []
# The decoder starts from the encoder's final hidden and cell states.
hidden, cell = state_h, state_c

for t in range(max_decoder_len):
    # Slice out the token at step t and restore the axis the slice removed.
    dec_in_t = tf.expand_dims(decoder_input[:, t], 1)
    output, hidden, cell, _ = decoder_layer(dec_in_t, hidden, cell, encoder_ouputs)

    all_outputs.append(output)

# Each per-step output is (batch, vocab) because the decoder LSTM does not
# return sequences. Stacking on a NEW axis 1 gives (batch, steps, vocab), which
# is what sparse_categorical_crossentropy needs against (batch, steps) integer
# targets. Concatenating on axis 1 would instead merge the step and vocabulary
# axes into (batch, steps * vocab).
decoder_outputs = tf.stack(all_outputs, axis=1)
training_model = tf.keras.Model([encoder_input, decoder_input], decoder_outputs)
training_model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")

In [None]:
# Inference
# Encoder model: source token ids -> (encoder outputs, final hidden state,
# final cell state).
encoder_model = tf.keras.Model(
    inputs=encoder_input,
    outputs=[encoder_ouputs, state_h, state_c],
)

# Single-step decoder model. It reuses the `decoder_layer` instance from the
# training graph, so the two share the same weights.
dec_token_input = tf.keras.Input(name="dec_token_input", shape=(1,))
dec_h_input = tf.keras.Input(name="dec_h_input", shape=(units,))
dec_c_input = tf.keras.Input(name="dec_c_input", shape=(units,))
enc_out_input = tf.keras.Input(name="enc_out_input", shape=(None, units))

dec_output, dec_h, dec_c, attn = decoder_layer(
    dec_token_input, dec_h_input, dec_c_input, enc_out_input
)

decoder_model = tf.keras.Model(
    inputs=[dec_token_input, dec_h_input, dec_c_input, enc_out_input],
    outputs=[dec_output, dec_h, dec_c, attn],
)

In [None]:
def decode_sequence(input_seq, max_len=20):
    """Greedily decode one source sentence into target token ids.

    Args:
        input_seq: source token ids passed to ``encoder_model.predict``.
            Assumed to hold a single sentence, i.e. shape (1, src_len): the
            decoder is seeded with a one-row token array.
        max_len: maximum number of tokens to generate.

    Returns:
        List of ``int`` target token ids in generation order. If the
        end-of-sequence token is produced it is included as the last element.
    """
    # The tokenizer lower-cases by default, so the symbols added as
    # "<SOS>"/"<EOS>" are stored as "<sos>"/"<eos>". Look both up once.
    word_index = decoder_tokenizer.word_index
    sos_id = word_index['<sos>']
    eos_id = word_index['<eos>']

    # Encode the source sentence.
    enc_out, h, c = encoder_model.predict(input_seq)

    # Seed the decoder with the start token as a (1, 1) array; every later
    # step feeds a (1, 1) array too, so the first step must not be a bare int.
    dec_token = np.array([[sos_id]])
    result = []

    for _ in range(max_len):
        # Run one decoder step, carrying the LSTM state forward.
        output, h, c, _attn = decoder_model.predict(
            [dec_token, h, c, enc_out]
        )

        # Pick the highest-probability token of the first (only) sample.
        # Taking argmax over the last axis and flattening works whether the
        # step output is (1, vocab) or (1, 1, vocab).
        token_id = int(np.ravel(np.argmax(output[0], axis=-1))[0])
        result.append(token_id)

        if token_id == eos_id:
            break

        # Feed the predicted token back in as the next decoder input.
        dec_token = np.array([[token_id]])

    return result
