In [1]:
from keras import layers
import tensorflow as tf
import numpy as np
import keras.backend as K
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model

Using TensorFlow backend.


In [2]:
# config
NUM_SAMPLES = 10000 # number of sample to train on
MAX_NUM_WORDS = 20000
MAX_SEQ_LEN = 26

In [3]:
# processing dataset 
input_texts = []
target_texts_output = []
target_texts_input = []

t=0
for line in open('data/eng_to_hindi.txt',encoding='utf-8'):
    t+=1
    if t > NUM_SAMPLES:
        break
    if '\t' not in line:
        continue
    input_text, translation = line.split('\t')
    target_text_output = translation.strip() + ' <eos>'
    target_text_input = '<sos> '+ translation.strip()
    
    input_texts.append(input_text)
    target_texts_output.append(target_text_output)
    target_texts_input.append(target_text_input)
print("number of samples : {}".format(len(input_texts)))

number of samples : 2869


In [4]:
# tokenizing sentences 
#input
tokenizer_inputs = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer_inputs.fit_on_texts(input_texts)
input_sequences = tokenizer_inputs.texts_to_sequences(input_texts)
word2idx_inputs = tokenizer_inputs.word_index
print('Unique tokkens in inputs : {}'.format(len(word2idx_inputs)))
max_len_input = max([len(s) for s in input_sequences])
print(max_len_input)

#output
tokenizer_outputs = Tokenizer(num_words=MAX_NUM_WORDS,filters='')
tokenizer_outputs.fit_on_texts(target_texts_output+target_texts_input)
target_sequences_input = tokenizer_outputs.texts_to_sequences(target_texts_input)
target_sequences_output = tokenizer_outputs.texts_to_sequences(target_texts_output)
word2idx_outputs = tokenizer_outputs.word_index
print('Unique tokkens in outputs : {}'.format(len(word2idx_outputs)))
max_len_target = max([len(s) for s in target_sequences_input])
num_words_output = len(word2idx_outputs) + 1
print(max_len_target)

Unique tokkens in inputs : 2402
22
Unique tokkens in outputs : 3161
26


In [5]:
# pad sequences
encoder_inputs = pad_sequences(input_sequences,maxlen=MAX_SEQ_LEN)
print("encoder shape :",encoder_inputs.shape)
print("encoder_data[0] s:",encoder_inputs[0])

decoder_inputs = pad_sequences(target_sequences_input,maxlen=MAX_SEQ_LEN, padding='post')
print("decoder input shape :",decoder_inputs.shape)
print("decoder_input_data[0] s:",decoder_inputs[0])

decoder_targets = pad_sequences(target_sequences_output,maxlen=MAX_SEQ_LEN, padding='post')
print("decoder output shape :",decoder_targets.shape)
print("decoder_output_data[0] s:",decoder_targets[0])

encoder shape : (2869, 26)
encoder_data[0] s: [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0 90]
decoder input shape : (2869, 26)
decoder_input_data[0] s: [   2 1500    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0]
decoder output shape : (2869, 26)
decoder_output_data[0] s: [1500    1    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0]


In [6]:
# Building model
class PositionalEncoding(layers.Layer):
    def __init__(self):
        super(PositionalEncoding,self).__init__()
        
    def get_angles(self,pos,i,d_model):#pos : (seq_len ,1)  i : (1,d_model)
        angles = 1 / np.power(10000.0,(2*(i//2))/np.float32(d_model))
        return pos * angles # (seq_len , d_model)
    
    def __call__(self,inputs):
        seq_length = inputs.shape.as_list()[-2]
        d_model = inputs.shape.as_list()[-1]
        angles = self.get_angles(np.arange(seq_length)[:,np.newaxis],
                                 np.arange(d_model)[np.newaxis,:],
                                 d_model)
        angles[:,0::2] = np.sin(angles[:,0::2] ) 
        angles[:,1::2] = np.cos(angles[:,1::2] ) 
        pos_encoding = angles[np.newaxis,...]
        
        return inputs + tf.cast(pos_encoding,tf.float32)

In [7]:
def scaled_dot_product_attention(queris,keys,values,mask):
    product = tf.matmul(queris,keys,b_transpose=True)
    
    keys_dim = tf.cast(tf.shape(keys)[-1],tf.float32)
    scaled_product = product / tf.math.sqrt(keys_dim)
    
    if mask is not None:
        scaled_product += mask*(-1e9)
    
    attention = tf.matmal(tf.nn.softmax(scaled_product, axis=-1), values)
    
    return attention

In [8]:
class MultiHeadAttention(layers.Layer):
    def ___init__(self,nb_proj):
        super(self,MultiHeadAttention).__init__()
        self.nb_proj = nb_proj
        
    def build(self,input_shape):
        self.d_model = input_shape[-1]
        assert self.d_model % self.nb_proj == 0
        self.d_proj = self.d_model / self.nb_proj
        
        self.query_lin = layers.Dense(units=self.d_model)
        self.key_lin = layers.Dense(units=self.d_model)
        self.val_lin = layers.Dense(units=self.d_model)
        
        self.final_lin = layers.Dense(units=self.d_model)
        
    def split_proj(self,inputs, batch_size): # inputs : (batch_size,seq_len,d_model)
        shape = (batch_size,
                 -1,
                 self.nb_proj,
                 self.d_proj)
        
        splited_inputs = tf.reshape(inputs, shape=shape) # (batch_size, seq_len, nb_proj,d_proj)
        return tf.transpose(splited_inputs,perm=[0,2,1,3]) # (batch_size,nb_proj,seq_len,d_proj)
        
    def __call__(self,queries,keys,values,mask):
        batch_size = tf.shape(queries)[0]
        queries = self.query_lin(queries)
        keys = self.key_lin(keys)
        values = self.val_lin(values)
        
        queries = self.split_proj(queries,batch_size)
        keys = self.split_proj(keys,batch_size)
        values = self.split_proj(values,batch_size)
        
        attention = scaled_dot_product_attention(queries, keys, values, mask)
        
        attention = tf.transpose(attention, perm=[0,2,1,3])
        
        concat_attention = tf.reshape(attention,
                                           shape=(batch_size,-1,self.d_model))
        
        output = self.final_lin(concat_attention)
        
        return output
        

In [17]:
class EncoderLayer(layers.Layer):
    def __init__(self,FFN_units,nb_proj,dropout):
        super(EncoderLayer,self).__init__()
        self.nb_prob = nb_proj
        self.FFN_units = FFN_units
        self.dropout = dropout 
        self.multi_head_attention1 = MultiHeadAttention(self.nb_prob)
        
    def build(self,input_shape):
        self.d_model = input_shape[-1]
        
        self.dropout_1 = layers.Dropout(rate=self.dorpout)
        self.norm_1 = layers.LayerNormalization(epsilon=1e-6)
        
        self.dense_1 = layers.Dense(units=self.FFN_units, activation='relu')
        self.dense_2 = layers.Dense(units=self.d_model)
        self.dropout_2 =  layers.Dropout(rate=self.dorpout)
        self.norm_2 = layers.LayerNormalization(epsilon=1e-6)
    
    def __call__(self,inputs, mask):
        attention = self.multi_head_attention1(inputs, inputs, inputs, mask)
        attention = self.dropout_1(attention)
        attention = self.norm_1(attention + inputs)
        
        outputs = self.dense_1(attention)
        outputs = self.dense_2(outputs)
        outputs = self.dropout_2(outputs)
        outputs = self.norm_2(attention + outputs)
        
        return outputs
        

In [18]:
class Encoder(layers.Layer):
    def __init__(self,
                 nb_layers,
                 FFN_units,
                 nb_proj,
                 dropout,
                 vocab_size,
                 d_model,
                 name='encoder'):
        super(Encoder,self).__init__(name=name)
        self.nb_layers = nb_layers
        self.d_model = d_model
        
        self.embedding = layers.Embedding(vocab_size,d_model)
        self.pos_encoding = PositionalEncoding()
        self.dropout = layers.Dropout(rate=dropout)
        self.enc_layers = [EncoderLayer(FFN_units,nb_proj,dropout) for _ in range(self.nb_layers)]
        
    def __call__(self,inputs,mask):
        outputs = self.embedding(inputs)
        outputs *= tf.math.sqrt(tf.cast(self.d_model,tf.float32))
        outputs = self.pos_encoding(outputs)
        outputs = self.dropout(outputs)
        
        for i in range(self.nb_layers):
            outputs = self.enc_layers[i](outputs, mask)
            
        return outputs

In [19]:
class DecoderLayer(layers.Layer):
    def __init__(self,FFN_units,nb_proj,dropout):
        super(DecoderLayer,self).__init__()
        self.FFN_units = FFN_units
        self.nb_proj = nb_proj
        self.dropout = dropout
        
    def build(self,input_shape):
        self.d_model = input_shape[-1]
        self.multi_head_attention_1 = MultiHeadAttention(self.nb_prob)
        self.dropout_1 = layers.Dropout(rate=self.dropout)
        self.norm_1 = layers.LayerNormalization(epsilon=1e-6)
        
        self.multi_head_attention_2 = MultiHeadAttention(self.nb_prob)
        self.dropout_2 = layers.Dropout(rate=self.dropout)
        self.norm_2 = layers.LayerNormalization(epsilon=1e-6)
        
        self.dense_1 = layers.Dense(units=self.FFN_units, activation='relu')
        self.dense_2 = layers.Dense(units=d_model)
        self.dropout_3 =  layers.Dropout(rate=self.dropout)
        self.norm_3 = layers.LayerNormalization(epsilon=1e-6)
        
    def __call__(self,inputs,enc_outputs,mask_1,mask_2):
        attention = self.multi_head_attention_1(inputs,inputs,inputs,mask_1)
        attention = self.dropout_1(attention)
        attention = self.norm_1(inputs + attention)
        
        attention_2 = self.multi_head_attention_2(attention,enc_outputs,enc_outputs,mask_2)
        attention_2 = self.dropout_2(attention_2)
        attention_2 = self.norm_2(attention + attention_2)
        
        outputs = self.dense_1(attention_2)
        outputs = self.dense_2(outputs)
        outputs = self.dropout_3(outputs)
        outputs = self.norm_3(attention_2 + outputs)
        
        return outputs
        

In [20]:
class Decoder(layers.Layer):
    def __init__(self,
                 nb_layers,
                 FFN_units,
                 nb_proj,
                 dropout,
                 vocab_size,
                 d_model,
                 name='decoder'):
        super(Decoder,self).__init__(name=name)
        self.nb_layers = nb_layers
        self.d_model = d_model
        
        self.embedding = layers.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding()
        self.dropout = layers.Dropout(rate=dropout)
        
        self.dec_layers = [DecoderLayer(FFN_units,nb_proj,dropout) for _ in range(self.nb_layers)]
        
    def __call__(self,inputs,enc_outputs,mask_1,mask_2):
        outputs = self.embedding(inputs)
        outputs *= tf.math.sqrt(tf.cast(self.d_model,tf.float32))
        outputs = self.pos_encoding(outputs)
        outputs = self.dropout(outputs)
        
        for i in range(self.nb_layers):
            outputs = self.dec_layers[i](outputs,enc_outputs,mask_1,mask_2)
        return outputs
        

In [21]:
class Transformer(layers.Layer):
    def __init__(self,
                 vocab_size_enc,
                 vocab_size_dec,
                 d_model,
                 nb_layers,
                 FFN_units,
                 nb_proj,
                 dropout,
                 name='transformer'):
        super(Transformer,self).__init__(name=name)
        
        self.encoder = Encoder(nb_layers,
                               FFN_units,
                               nb_proj,
                               dropout,
                               vocab_size_enc,
                               d_model)
        
        self.decoder = Decoder(nb_layers,
                               FFN_units,
                               nb_proj,
                               dropout,
                               vocab_size_dec,
                               d_model)
        
        self.last_linear = layers.Dense(units=vocab_size_dec)
        
    def create_padding_mask(self,seq): # seq : (batch_size,seq_len)
        mask = tf.cast(tf.math.equal(seq,0),tf.float32)
        return mask[:,tf.newaxis,tf.newaxis,:]
                
    def create_look_ahead_mask(self,seq):
        seq_len = tf.shape(seq)[1]
        look_ahead_mask = 1 - tf.linalg.band_part(tf.ones((seq_len,seq_len)),-1,0)
        return look_ahead_mask
    
    def __call__(self,enc_inputs,dec_inputs):
        enc_mask = self.create_padding_mask(enc_inputs)
        dec_mask_1 = tf.maximum(self.create_padding_mask(dec_inputs),self.create_look_ahead_mask(dec_inputs))
        dec_mask_2 = self.create_padding_mask(enc_inputs)
        
        enc_outputs = self.encoder(enc_inputs, enc_mask)
        dec_outputs = self.decoder(dec_inputs,
                                   enc_outputs,
                                   dec_mask_1,
                                   dec_mask_2)
        outputs = self.last_linear(dec_outputs)
        
        return outputs

In [22]:
# Training
tf.keras.backend.clear_session()
# hyper Parameters
D_MODEL = 128 # 512
NB_LAYERS = 4 # 6
FFN_UNITS = 512 # 2048
NB_PROJ = 8 # 8
DROPOUT = .1 # .1
VOCAB_SIZE_EN = len(word2idx_inputs) + 1
VOCAB_SIZE_H = len(word2idx_outputs) + 1

transformer = Transformer( vocab_size_enc=VOCAB_SIZE_EN,
                           vocab_size_dec=VOCAB_SIZE_H,
                           d_model=D_MODEL,
                           nb_layers=NB_LAYERS,
                           FFN_units=FFN_UNITS,
                           nb_proj=NB_PROJ,
                           dropout=DROPOUT)

enc_inputs = layers.Input(shape=(MAX_SEQ_LEN,))
dec_inputs = layers.Input(shape=(MAX_SEQ_LEN,))
outputs = transformer(enc_inputs,dec_inputs)

model = Model(inputs=[enc_inputs,dec_inputs],outputs=outputs)

TypeError: __init__() takes 1 positional argument but 2 were given

In [None]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,reduction='none')

def loss_function(target,pred):
    mask = tf.math.logical_not(tf.math.equal(target,0))
    loss_ = loss_object(target,pred)
    
    mask = tf.cast(mask,dtype=loss_.dtype)
    loss_ *=mask
    
    return tf.reduce_mean(loss_)

train_loss = tf.keras.metrics.Mean(name="train_loss")
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name="train_accuracy")


In [None]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    
    def __init__(self,d_model,warmup_steps=4000):
        super(CustomSchedule,self).__init__()
        
        self.warmup_steps = warmup_steps
        self.d_model = tf.cast(d_model, tf.float32)
        
    def __call__(self,step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps**-1.5)
        
        return tf.math.rsqrt(self.d_model*tf.minimum(arg1,arg2))

learning_rate = CustomSchedule(D_MODEL)

optimizer = tf.keras.optimizers.Adam(learning_rate,
                                    beta_1=.9,
                                    beta_2=.98,
                                    epsilon=1e-9)


    

In [None]:
transformer.compile(optimizer=optimizer,loss=loss_function,metrics=train_accuracy)

In [None]:
history = transformer.fit(inputs=[encoder_inputs,decoder_inputs],
                outputs=decoder_targets,
                batch_size = 64,
                epochs = 5)