# Transformer for Tags
### To built a transformer using the 'steps' and 'tags' column of the RAW_recipes.csv

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
# Change working directory to be current folder
import os
os.chdir('/content/gdrive/My Drive/Colab Notebooks/')

In [3]:
import numpy as np
import pandas as pd
import pickle
import re
import string
import tensorflow as tf
import time

from tensorflow import keras

print (tf.__version__)

2.6.0


In [47]:
recipes = pd.read_csv("RAW_recipes.csv", usecols=['tags', 'steps'], nrows=10000)
print (recipes)

                                                   tags                                              steps
0     ['60-minutes-or-less', 'time-to-make', 'course...  ['make a choice and proceed with recipe', 'dep...
1     ['30-minutes-or-less', 'time-to-make', 'course...  ['preheat oven to 425 degrees f', 'press dough...
2     ['time-to-make', 'course', 'preparation', 'mai...  ['brown ground beef in large pot', 'add choppe...
3     ['60-minutes-or-less', 'time-to-make', 'course...  ['place potatoes in a large pot of lightly sal...
4     ['weeknight', 'time-to-make', 'course', 'main-...  ['mix all ingredients& boil for 2 1 / 2 hours ...
...                                                 ...                                                ...
9995  ['30-minutes-or-less', 'time-to-make', 'course...  ['mix lemon juice , vinegar , and country dijo...
9996  ['15-minutes-or-less', 'time-to-make', 'course...  ['for sauce , combine in a bowl yogurt , mayon...
9997  ['15-minutes-or-less', 'time-to

In [48]:
tags = recipes['tags']
steps = recipes['steps']

## Preprocessing

In [49]:
def preprocess(sentence):
  # Strip "[]'," from the sentence
  sentence = sentence.translate(str.maketrans('', '', "[]',"))

  # Adding a start and an end token to the sentence so that the model know when to start and stop predicting.
  sentence = '<start> ' + sentence + ' <end>'

  return sentence

In [50]:
preprocessed_tags = ()
for tag in tags:
  tag = preprocess(tag)
  preprocessed_tags += (tag,)

preprocessed_steps = ()
for step in steps:
  step = preprocess(step)
  preprocessed_steps += (step,)

#### Obtaining insights on lengths for defining maxlen

In [51]:
steps_lengths = pd.Series([len(x) for x in steps])
tags_lengths = pd.Series([len(x) for x in tags])

In [52]:
steps_lengths.describe()

count    10000.000000
mean       557.644200
std        407.349132
min          2.000000
25%        294.000000
50%        467.000000
75%        704.250000
max       5979.000000
dtype: float64

In [53]:
tags_lengths.describe()

count    10000.000000
mean       257.868900
std        102.061412
min          4.000000
25%        182.000000
50%        247.000000
75%        321.000000
max        706.000000
dtype: float64

In [54]:
# maxlen
# Taking values > and round figured to 75th percentile
# At the same time not leaving high variance
input_maxlen = 700
target_maxlen = 350

In [55]:
def tokenize(sentence, maxlen):
  tokenizer = tf.keras.preprocessing.text.Tokenizer(
      filters='')
  tokenizer.fit_on_texts(sentence)

  tensor = tokenizer.texts_to_sequences(sentence)

  tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, maxlen=maxlen, padding='post', truncating='post')

  return tensor, tokenizer

In [56]:
input_tensor, inp_tokenizer = tokenize(preprocessed_steps, input_maxlen)
target_tensor, targ_tokenizer = tokenize(preprocessed_tags, target_maxlen)

In [14]:
with open('targ_tokenizer_cpu.pickle', 'wb') as handle:
    pickle.dump(targ_tokenizer, handle)

In [57]:
BUFFER_SIZE = len(input_tensor)
BATCH_SIZE = 64
vocab_inp_size = len(inp_tokenizer.word_index)+1
vocab_tar_size = len(targ_tokenizer.word_index)+1
print (BUFFER_SIZE)
print (vocab_inp_size)
print (vocab_tar_size)
dataset = tf.data.Dataset.from_tensor_slices((input_tensor, target_tensor)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

10000
13778
478


### Positional Encoding for adding notion of position among words as unlike RNN this is non-directional

In a transformer, these positional encodings are passed to the encoder and decoder at the beginning.

In [20]:
def get_angles(position, i, d_model):
    angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
    
    return position * angle_rates

In [21]:
def positional_encoding(position, d_model):
    angle_rads = get_angles(
        np.arange(position)[:, np.newaxis],
        np.arange(d_model)[np.newaxis, :],
        d_model
    )

    # Apply sin to even indices in the array; 2i
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])

    # Apply cos to odd indices in the array; 2i+1
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

    pos_encoding = angle_rads[np.newaxis, ...]

    return tf.cast(pos_encoding, dtype=tf.float32)

### Masking

- Padding mask for masking "pad" sequences
- Lookahead mask for masking future words from contributing in prediction of current words in self attention

The masking is applied to the decoding stage only.

In [22]:
def create_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    
    return seq[:, tf.newaxis, tf.newaxis, :]

In [23]:
def create_look_ahead_mask(size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    
    return mask

## Building the Model


#### Scaled Dot Product

In [24]:
def scaled_dot_product_attention(q, k, v, mask):
    matmul_qk = tf.matmul(q, k, transpose_b=True)
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
    if mask is not None:
        scaled_attention_logits += (mask * -1e9)  
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
    output = tf.matmul(attention_weights, v)

    return output, attention_weights

#### Multi-Headed Attention

In [25]:
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        assert d_model % self.num_heads == 0

        self.depth = d_model // self.num_heads

        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        self.dense = tf.keras.layers.Dense(d_model)
        
    def split_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        
        return tf.transpose(x, perm=[0, 2, 1, 3])
    
    def call(self, v, k, q, mask):
        batch_size = tf.shape(q)[0]

        q = self.wq(q)
        k = self.wk(k)
        v = self.wv(v)

        q = self.split_heads(q, batch_size)
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)

        scaled_attention, attention_weights = scaled_dot_product_attention(
            q, k, v, mask)

        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])

        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))
        output = self.dense(concat_attention)
            
        return output, attention_weights

### Feed Forward Network

In [26]:
def point_wise_feed_forward_network(d_model, dff): # dff is no of neurons in the layer
    return tf.keras.Sequential([
        tf.keras.layers.Dense(dff, activation='relu'),
        tf.keras.layers.Dense(d_model)
    ])

#### Fundamental Unit of Transformer encoder

In [27]:
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(EncoderLayer, self).__init__()

        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
    
    def call(self, x, training, mask):
        attn_output, _ = self.mha(x, x, x, mask)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)

        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)

        return out2

#### Fundamental Unit of Transformer decoder

In [28]:
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(DecoderLayer, self).__init__()

        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)

        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)
    
    
    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layernorm1(attn1 + x)

        attn2, attn_weights_block2 = self.mha2(enc_output, enc_output, out1, padding_mask)
        attn2 = self.dropout2(attn2, training=training)
        out2 = self.layernorm2(attn2 + out1)

        ffn_output = self.ffn(out2)
        ffn_output = self.dropout3(ffn_output, training=training)
        out3 = self.layernorm3(ffn_output + out2)

        return out3, attn_weights_block1, attn_weights_block2

#### Encoder consisting of multiple EncoderLayer(s)

In [29]:
class Encoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, maximum_position_encoding, rate=0.1):
        super(Encoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding, self.d_model)

        self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]

        self.dropout = tf.keras.layers.Dropout(rate)
        
    def call(self, x, training, mask):
        seq_len = tf.shape(x)[1]

        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)
    
        for i in range(self.num_layers):
            x = self.enc_layers[i](x, training, mask)
    
        return x

#### Decoder consisting of multiple DecoderLayer(s)

In [30]:
class Decoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size, maximum_position_encoding, rate=0.1):
        super(Decoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)

        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)
    
    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        seq_len = tf.shape(x)[1]
        attention_weights = {}

        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x, block1, block2 = self.dec_layers[i](x, enc_output, training, look_ahead_mask, padding_mask)

            attention_weights['decoder_layer{}_block1'.format(i+1)] = block1
            attention_weights['decoder_layer{}_block2'.format(i+1)] = block2
    
        return x, attention_weights

#### Finally, the Transformer

In [31]:
class Transformer(tf.keras.Model):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size, pe_input, pe_target, rate=0.1):
        super(Transformer, self).__init__()

        self.encoder = Encoder(num_layers, d_model, num_heads, dff, input_vocab_size, pe_input, rate)

        self.decoder = Decoder(num_layers, d_model, num_heads, dff, target_vocab_size, pe_target, rate)

        self.final_layer = tf.keras.layers.Dense(target_vocab_size)
    
    def call(self, inputs, training):
        inp, tar = inputs

        enc_padding_mask, look_ahead_mask, dec_padding_mask  = self.create_masks(inp, tar)

        enc_output = self.encoder(inp, training, enc_padding_mask)

        dec_output, attention_weights = self.decoder(tar, enc_output, training, look_ahead_mask, dec_padding_mask)

        final_output = self.final_layer(dec_output)

        return final_output, attention_weights

    def create_masks(self, inp, tar):
        # Encoder padding mask
        enc_padding_mask = create_padding_mask(inp)

        # Used in the 2nd attention block in the decoder.
        # This padding mask is used to mask the encoder outputs.
        dec_padding_mask = create_padding_mask(inp)

        # Used in the 1st attention block in the decoder.
        # It is used to pad and mask future tokens in the input received by
        # the decoder.
        look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
        dec_target_padding_mask = create_padding_mask(tar)
        look_ahead_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)

        return enc_padding_mask, look_ahead_mask, dec_padding_mask

### Training

In [32]:
# Hyper-params
num_layers = 4
d_model = 128
dff = 512
num_heads = 8
EPOCHS = 10

#### Adam optimizer with custom learning rate scheduling

In [31]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()

        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)

        self.warmup_steps = warmup_steps
    
    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

#### Defining losses and other metrics 

In [32]:
learning_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

In [33]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

In [34]:
def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_sum(loss_)/tf.reduce_sum(mask)

In [35]:
train_loss = tf.keras.metrics.Mean(name='train_loss')

#### Transformer

In [33]:
transformer = Transformer(
    num_layers, 
    d_model, 
    num_heads, 
    dff,
    vocab_inp_size, 
    vocab_tar_size, 
    pe_input=vocab_inp_size, 
    pe_target=vocab_tar_size,
)

#### Checkpoints

In [37]:
checkpoint_path = "checkpoints"

ckpt = tf.train.Checkpoint(transformer=transformer, optimizer=optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# if ckpt_manager.latest_checkpoint:
#     ckpt.restore(ckpt_manager.latest_checkpoint)
#     print ('Latest checkpoint restored!!') 

In [38]:
@tf.function
def train_step(inp, tar):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]

    with tf.GradientTape() as tape:
        predictions, _ = transformer(
            [inp, tar_inp], 
            True
        )
        loss = loss_function(tar_real, predictions)

    gradients = tape.gradient(loss, transformer.trainable_variables)    
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

    train_loss(loss)

In [39]:
for epoch in range(EPOCHS):
    start = time.time()

    train_loss.reset_states()
  
    for (batch, (inp, tar)) in enumerate(dataset):
        train_step(inp, tar)

        if batch % 78 == 0:
            print ('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, batch, train_loss.result()))
      
    if (epoch + 1) % 5 == 0:
        ckpt_save_path = ckpt_manager.save()
        print ('Saving checkpoint for epoch {} at {}'.format(epoch+1, ckpt_save_path))
    
    print ('Epoch {} Loss {:.4f}'.format(epoch + 1, train_loss.result()))

    print ('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 6.2951
Epoch 1 Loss 6.2708
Time taken for 1 epoch: 681.9398462772369 secs

Epoch 2 Batch 0 Loss 6.2405
Epoch 2 Loss 6.1051
Time taken for 1 epoch: 608.4823083877563 secs

Epoch 3 Batch 0 Loss 5.9638
Epoch 3 Loss 5.8233
Time taken for 1 epoch: 605.0800099372864 secs

Epoch 4 Batch 0 Loss 5.7017
Epoch 4 Loss 5.5679
Time taken for 1 epoch: 612.1189675331116 secs

Epoch 5 Batch 0 Loss 5.4553
Saving checkpoint for epoch 5 at checkpoints/ckpt-1
Epoch 5 Loss 5.3906
Time taken for 1 epoch: 615.1195859909058 secs

Epoch 6 Batch 0 Loss 5.3439
Epoch 6 Loss 5.2717
Time taken for 1 epoch: 610.9490187168121 secs

Epoch 7 Batch 0 Loss 5.2018
Epoch 7 Loss 5.1640
Time taken for 1 epoch: 607.4905774593353 secs

Epoch 8 Batch 0 Loss 5.1542
Epoch 8 Loss 5.0605
Time taken for 1 epoch: 619.1171875 secs

Epoch 9 Batch 0 Loss 4.9730
Epoch 9 Loss 4.9003
Time taken for 1 epoch: 612.0162568092346 secs

Epoch 10 Batch 0 Loss 4.8009
Saving checkpoint for epoch 10 at checkpoints/ckpt-2
Epoch 10

### Inference
Predicting one word at a time at the decoder and appending it to the output; then taking the complete sequence as an input to the decoder and repeating until maxlen or stop keyword appears

In [34]:
class GenerateTags(tf.Module):
    def __init__(self, inp_tokenizer, targ_tokenizer, transformer):
        super(GenerateTags, self).__init__()
        self.inp_tokenizer = inp_tokenizer
        self.targ_tokenizer = targ_tokenizer
        self.transformer = transformer

    def __call__(self, input_recipe, input_maxlen=700, target_maxlen=350):
        input_recipe = self.inp_tokenizer.texts_to_sequences([input_recipe])
        input_recipe = tf.keras.preprocessing.sequence.pad_sequences(input_recipe, maxlen=input_maxlen, padding='post', truncating='post')

        encoder_input = tf.expand_dims(input_recipe[0], 0)

        decoder_input = [self.targ_tokenizer.word_index["<start>"]]
        output = tf.expand_dims(decoder_input, 0)
        
        for i in range(target_maxlen):
            predictions, attention_weights = self.transformer(
                [encoder_input, output],
                False
            )

            predictions = predictions[: ,-1:, :]
            predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

            if predicted_id == self.targ_tokenizer.word_index["<end>"]:
                break
            else:
                output = tf.concat([output, predicted_id], axis=-1)

        tags = tf.squeeze(output, axis=0)
        tags = tags.numpy()
        tags = np.expand_dims(tags[1:], 0)

        return self.targ_tokenizer.sequences_to_texts(tags)[0]

In [36]:
generate_tags = GenerateTags(inp_tokenizer, targ_tokenizer, transformer)

In [40]:
# Save transformer weights
transformer.save_weights('transformer')

In [35]:
# Load transformer weights
# transformer.load_weights('transformer')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f7ff81c30d0>

In [45]:
sentence = "'make a choice and proceed with recipe', 'depending on size of squash , cut into half or fourths', 'remove seeds', 'for spicy squash , drizzle olive oil or melted butter over each cut squash piece', 'season with mexican seasoning mix ii', 'for sweet squash , drizzle melted honey , butter , grated piloncillo over each cut squash piece', 'season with sweet mexican spice mix', 'bake at 350 degrees , again depending on size , for 40 minutes up to an hour , until a fork can easily pierce the skin', 'be careful not to burn the squash especially if you opt to use sugar or butter', 'if you feel more comfortable , cover the squash with aluminum foil the first half hour , give or take , of baking', 'if desired , season with salt'"
sentence = sentence.translate(str.maketrans('', '', "[]',"))

In [46]:
tags = generate_tags(sentence)
print (tags)

time-to-make time-to-make course preparation preparation preparation dietary
