In [59]:
# Copyright 2019 The TensorFlow Authors.

#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# https://github.com/tensorflow/docs/blob/master/site/en/tutorials/text/transformer.ipynb

In [60]:
# Modifications Copyright (C) 2020 Rohan Jagtap

In [61]:
# cd /content/drive/My Drive/Colab Notebooks/summarizer_transformer/

In [62]:
# !pip install tensorflow

In [63]:
import pandas as pd
import numpy as np
import tensorflow as tf
import time
import re
import pickle
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow import keras

### Loading Data

In [64]:
news = pd.read_csv("cnn_dailymail/train.csv")

In [65]:
# news = pd.read_excel("data/news.xlsx")

In [66]:
# news.drop(['Source ', 'Time ', 'Publish Date'], axis=1, inplace=True)

In [67]:
news.head()

Unnamed: 0,id,article,highlights
0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
2,00027e965c8264c35cc1bc55556db388da82b07f,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t..."
3,0002c17436637c4fe1837c935c04de47adb18e9a,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...
4,0003ad6ef0c37534f80b55b4235108024b407f0b,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...


In [68]:
news.shape

(287113, 3)

In [69]:
news = news[news.article.apply(lambda x: len(x)<400)]

In [70]:
news.shape

(143, 3)

In [71]:
document = news['article']
summary = news['highlights']

In [72]:
news = news.reset_index(drop=True)
document = document.reset_index(drop=True)
summary= summary.reset_index(drop=True)

In [73]:
document

0      (CNN Student News) -- September 2, 2011 . Down...
1      (CNN) -- Former baseball slugger Jose Canseco ...
2      (CNN) -- From E!'s strange "Fun Facts" to Dian...
3      (CNN Student News) -- January 27, 2012 . Downl...
4      (CNN Student News) -- September 14, 2012 . Dow...
                             ...                        
138    (CNN Student News) -- January 4, 2013 . Downlo...
139    (CNN) -- As part of CNN's Defining America pro...
140    (CNN Student News) -- September 9, 2011 . Down...
141    (CNN) -- Living Golf's resident pro Adam Scott...
142    (CNN Student News) -- March 23, 2012 . Downloa...
Name: article, Length: 143, dtype: object

In [74]:
document[30], summary[30]

("It's animals. In the snow. Need we say more? From cats to cows, puppies to camels, check out animals enjoying the winter weather. Have your own snowy pet photos to share? Submit at CNN iReport, and they could be added to the gallery.",
 'See cats, dogs, camels and birds enjoying the winter weather .\nSubmit your own wintry animal photos at CNN iReport .\nAlso check out our gallery of beautiful ice formations .')

### Preprocessing

In [75]:
# for decoder sequence
summary = summary.apply(lambda x: '<go> ' + x + ' <stop>')
summary.head()

0    <go> The daily transcript is a written version...
1    <go> Canseco hit more than 450 home runs .\nHi...
2    <go> Rambling speeches among most interesting ...
3    <go> The daily transcript is a written version...
4    <go> The daily transcript is a written version...
Name: highlights, dtype: object

#### Tokenizing the texts into integer tokens

In [76]:
# since < and > from default tokens cannot be removed
filters = '!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n'
oov_token = '<unk>'

In [77]:
document_tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token=oov_token)
summary_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters=filters, oov_token=oov_token)

In [78]:
document_tokenizer.fit_on_texts(document)
summary_tokenizer.fit_on_texts(summary)

In [79]:
inputs = document_tokenizer.texts_to_sequences(document)
targets = summary_tokenizer.texts_to_sequences(summary)

In [80]:
summary_tokenizer.texts_to_sequences(["This is a test"])

[[17, 15, 13, 35]]

In [81]:
summary_tokenizer.sequences_to_texts([[184, 22, 12, 71]])

['town each student seats']

In [82]:
encoder_vocab_size = len(document_tokenizer.word_index) + 1
decoder_vocab_size = len(summary_tokenizer.word_index) + 1

# vocab_size
encoder_vocab_size, decoder_vocab_size

(1576, 1550)

#### Obtaining insights on lengths for defining maxlen

In [83]:
document_lengths = pd.Series([len(x) for x in document])
summary_lengths = pd.Series([len(x) for x in summary])

In [84]:
document_lengths.describe()

count    143.000000
mean     309.286713
std       57.668995
min       48.000000
25%      296.000000
50%      317.000000
75%      341.000000
max      398.000000
dtype: float64

In [85]:
summary_lengths.describe()

count     143.000000
mean      269.300699
std       399.958537
min        80.000000
25%       158.500000
50%       263.000000
75%       263.000000
max      3888.000000
dtype: float64

In [86]:
# maxlen
# taking values > and round figured to 75th percentile
# at the same time not leaving high variance
encoder_maxlen = 400
decoder_maxlen = 100

#### Padding/Truncating sequences for identical sequence lengths

In [87]:
inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs, maxlen=encoder_maxlen, padding='post', truncating='post')
targets = tf.keras.preprocessing.sequence.pad_sequences(targets, maxlen=decoder_maxlen, padding='post', truncating='post')

In [88]:
inputs

array([[   4,   11,   10, ...,    0,    0,    0],
       [   4,  199,  557, ...,    0,    0,    0],
       [   4,   40,  249, ...,    0,    0,    0],
       ...,
       [   4,   11,   10, ...,    0,    0,    0],
       [   4,  552, 1549, ...,    0,    0,    0],
       [   4,   11,   10, ...,    0,    0,    0]])

### Creating dataset pipeline

In [89]:
inputs = tf.cast(inputs, dtype=tf.int32)
targets = tf.cast(targets, dtype=tf.int32)

In [90]:
inputs[0]

<tf.Tensor: shape=(400,), dtype=int32, numpy=
array([  4,  11,  10,  60,  51,  49,  30,  31,  35,  26,   3,  12,  17,
        52, 554, 555,  52, 149,  52, 556,  97,  18,  16,   3,  27,   2,
        13,   5,  12,   4,  11,  10,  28,  24,  20,  15,  21,  19,  14,
         7,  32,  29,   2,  22,   9,   2,  23,   6,  33,   8,   9,   2,
        13,   6,  25,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,  

In [91]:
BUFFER_SIZE = 20000
BATCH_SIZE = 64

In [92]:
dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)).shuffle(BUFFER_SIZE).batch(BATCH_SIZE)

### Positional Encoding for adding notion of position among words as unlike RNN this is non-directional

In [93]:
def get_angles(position, i, d_model):
    angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
    return position * angle_rates

In [94]:
def positional_encoding(position, d_model):
    angle_rads = get_angles(
        np.arange(position)[:, np.newaxis],
        np.arange(d_model)[np.newaxis, :],
        d_model
    )

    # apply sin to even indices in the array; 2i
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])

    # apply cos to odd indices in the array; 2i+1
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

    pos_encoding = angle_rads[np.newaxis, ...]

    return tf.cast(pos_encoding, dtype=tf.float32)


### Masking

- Padding mask for masking "pad" sequences
- Lookahead mask for masking future words from contributing in prediction of current words in self attention

In [95]:
def create_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, tf.newaxis, tf.newaxis, :]

In [96]:
def create_look_ahead_mask(size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask

### Building the Model

#### Scaled Dot Product

In [97]:
def scaled_dot_product_attention(q, k, v, mask):
    matmul_qk = tf.matmul(q, k, transpose_b=True)

    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

    if mask is not None:
        scaled_attention_logits += (mask * -1e9)  

    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)

    output = tf.matmul(attention_weights, v)
    return output, attention_weights

#### Multi-Headed Attention

In [98]:
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        assert d_model % self.num_heads == 0

        self.depth = d_model // self.num_heads

        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        self.dense = tf.keras.layers.Dense(d_model)
        
    def split_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])
    
    def call(self, v, k, q, mask):
        batch_size = tf.shape(q)[0]

        q = self.wq(q)
        k = self.wk(k)
        v = self.wv(v)

        q = self.split_heads(q, batch_size)
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)

        scaled_attention, attention_weights = scaled_dot_product_attention(
            q, k, v, mask)

        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])

        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))
        output = self.dense(concat_attention)
            
        return output, attention_weights

### Feed Forward Network

In [99]:
def point_wise_feed_forward_network(d_model, dff):
    return tf.keras.Sequential([
        tf.keras.layers.Dense(dff, activation='relu'),
        tf.keras.layers.Dense(d_model)
    ])

#### Fundamental Unit of Transformer encoder

In [100]:
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(EncoderLayer, self).__init__()

        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
    
    def call(self, x, training, mask):
        attn_output, _ = self.mha(x, x, x, mask)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)

        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)

        return out2


#### Fundamental Unit of Transformer decoder

In [101]:
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(DecoderLayer, self).__init__()

        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)

        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)
    
    
    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layernorm1(attn1 + x)

        attn2, attn_weights_block2 = self.mha2(enc_output, enc_output, out1, padding_mask)
        attn2 = self.dropout2(attn2, training=training)
        out2 = self.layernorm2(attn2 + out1)

        ffn_output = self.ffn(out2)
        ffn_output = self.dropout3(ffn_output, training=training)
        out3 = self.layernorm3(ffn_output + out2)

        return out3, attn_weights_block1, attn_weights_block2


#### Encoder consisting of multiple EncoderLayer(s)

In [102]:
class Encoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, maximum_position_encoding, rate=0.1):
        super(Encoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding, self.d_model)

        self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]

        self.dropout = tf.keras.layers.Dropout(rate)
        
    def call(self, x, training, mask):
        seq_len = tf.shape(x)[1]

        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)
    
        for i in range(self.num_layers):
            x = self.enc_layers[i](x, training, mask)
    
        return x


#### Decoder consisting of multiple DecoderLayer(s)

In [103]:
class Decoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size, maximum_position_encoding, rate=0.1):
        super(Decoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)

        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)
    
    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        seq_len = tf.shape(x)[1]
        attention_weights = {}

        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x, block1, block2 = self.dec_layers[i](x, enc_output, training, look_ahead_mask, padding_mask)

            attention_weights['decoder_layer{}_block1'.format(i+1)] = block1
            attention_weights['decoder_layer{}_block2'.format(i+1)] = block2
    
        return x, attention_weights


#### Finally, the Transformer

In [104]:
class Transformer(tf.keras.Model):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size, pe_input, pe_target, rate=0.1):
        super(Transformer, self).__init__()

        self.encoder = Encoder(num_layers, d_model, num_heads, dff, input_vocab_size, pe_input, rate)

        self.decoder = Decoder(num_layers, d_model, num_heads, dff, target_vocab_size, pe_target, rate)

        self.final_layer = tf.keras.layers.Dense(target_vocab_size)
    
    def call(self, inp, tar, training, enc_padding_mask, look_ahead_mask, dec_padding_mask):
        enc_output = self.encoder(inp, training, enc_padding_mask)

        dec_output, attention_weights = self.decoder(tar, enc_output, training, look_ahead_mask, dec_padding_mask)

        final_output = self.final_layer(dec_output)

        return final_output, attention_weights


### Training

In [105]:
# hyper-params
num_layers = 4
d_model = int(128)
dff = 512
num_heads = 8
EPOCHS = 20

In [106]:
type(d_model)

int

#### Adam optimizer with custom learning rate scheduling

In [107]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()

        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)

        self.warmup_steps = warmup_steps
    
    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)


#### Defining losses and other metrics 

In [110]:
learning_rate = CustomSchedule(d_model)
print(learning_rate.from_config)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

<bound method LearningRateSchedule.from_config of <class '__main__.CustomSchedule'>>


InvalidArgumentError: Value for attr 'T' of int64 is not in the list of allowed values: bfloat16, half, float, double, complex64, complex128
	; NodeDef: {{node Rsqrt}}; Op<name=Rsqrt; signature=x:T -> y:T; attr=T:type,allowed=[DT_BFLOAT16, DT_HALF, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_COMPLEX128]> [Op:Rsqrt]

In [None]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

In [None]:
def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_sum(loss_)/tf.reduce_sum(mask)


In [None]:
train_loss = tf.keras.metrics.Mean(name='train_loss')

#### Transformer

In [None]:
transformer = Transformer(
    num_layers, 
    d_model, 
    num_heads, 
    dff,
    encoder_vocab_size, 
    decoder_vocab_size,
    pe_input=encoder_vocab_size, 
    pe_target=decoder_vocab_size,
    
)

#### Masks

In [None]:
def create_masks(inp, tar):
    enc_padding_mask = create_padding_mask(inp)
    dec_padding_mask = create_padding_mask(inp)

    look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
    dec_target_padding_mask = create_padding_mask(tar)
    combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)
  
    return enc_padding_mask, combined_mask, dec_padding_mask


#### Checkpoints

In [None]:
checkpoint_path = "/kaggle/working/checkpoints"

ckpt = tf.train.Checkpoint(transformer=transformer, optimizer=optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print ('Latest checkpoint restored!!')

#### Training steps

In [None]:
@tf.function
def train_step(inp, tar):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]
    
    print("inp:", inp, "tar:", tar_inp)
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)
    print("enc:", enc_padding_mask,"com:",  combined_mask,"dec:",  dec_padding_mask)

    with tf.GradientTape() as tape:
        predictions, _ = transformer(
            inp, tar_inp, 
            True, 
            enc_padding_mask, 
            combined_mask, 
            dec_padding_mask
        )
        loss = loss_function(tar_real, predictions)
    print("enc2:", enc_padding_mask,"com2:",  combined_mask,"dec2:",  dec_padding_mask)

    gradients = tape.gradient(loss, transformer.trainable_variables)    
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

    train_loss(loss)
    return inp, tar_inp, enc_padding_mask, combined_mask, dec_padding_mask

In [None]:
for (batch, (inp, tar)) in enumerate(dataset):
    inp, tar_inp, enc_padding_mask, combined_mask, dec_padding_mask= train_step(inp, tar)

inp: Tensor("inp:0", shape=(64, 400), dtype=int32) tar: Tensor("strided_slice:0", shape=(64, 99), dtype=int32)
enc: Tensor("strided_slice_2:0", shape=(64, 1, 1, 400), dtype=float32) com: Tensor("Maximum:0", shape=(64, 1, 99, 99), dtype=float32) dec: Tensor("strided_slice_3:0", shape=(64, 1, 1, 400), dtype=float32)
enc2: Tensor("strided_slice_2:0", shape=(64, 1, 1, 400), dtype=float32) com2: Tensor("Maximum:0", shape=(64, 1, 99, 99), dtype=float32) dec2: Tensor("strided_slice_3:0", shape=(64, 1, 1, 400), dtype=float32)
inp: Tensor("inp:0", shape=(64, 400), dtype=int32) tar: Tensor("strided_slice:0", shape=(64, 99), dtype=int32)
enc: Tensor("strided_slice_2:0", shape=(64, 1, 1, 400), dtype=float32) com: Tensor("Maximum:0", shape=(64, 1, 99, 99), dtype=float32) dec: Tensor("strided_slice_3:0", shape=(64, 1, 1, 400), dtype=float32)
enc2: Tensor("strided_slice_2:0", shape=(64, 1, 1, 400), dtype=float32) com2: Tensor("Maximum:0", shape=(64, 1, 99, 99), dtype=float32) dec2: Tensor("strided_sl

2023-02-13 04:07:57.513203: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


inp: Tensor("inp:0", shape=(15, 400), dtype=int32) tar: Tensor("strided_slice:0", shape=(15, 99), dtype=int32)
enc: Tensor("strided_slice_2:0", shape=(15, 1, 1, 400), dtype=float32) com: Tensor("Maximum:0", shape=(15, 1, 99, 99), dtype=float32) dec: Tensor("strided_slice_3:0", shape=(15, 1, 1, 400), dtype=float32)
enc2: Tensor("strided_slice_2:0", shape=(15, 1, 1, 400), dtype=float32) com2: Tensor("Maximum:0", shape=(15, 1, 99, 99), dtype=float32) dec2: Tensor("strided_slice_3:0", shape=(15, 1, 1, 400), dtype=float32)


In [None]:
inp, tar_inp, enc_padding_mask, combined_mask, dec_padding_mask

(<tf.Tensor: shape=(15, 400), dtype=int32, numpy=
 array([[  4, 839, 840, ...,   0,   0,   0],
        [  4,  11,  10, ...,   0,   0,   0],
        [  4,  11,  10, ...,   0,   0,   0],
        ...,
        [  4, 115,  55, ...,   0,   0,   0],
        [ 58, 303,   4, ...,   0,   0,   0],
        [  4,  11,  10, ...,   0,   0,   0]], dtype=int32)>,
 <tf.Tensor: shape=(15, 99), dtype=int32, numpy=
 array([[   6, 1013,  385, ...,    0,    0,    0],
        [   6,    2,   24, ...,    0,    0,    0],
        [   6,    2,   24, ...,    0,    0,    0],
        ...,
        [   6,   92,   40, ...,    0,    0,    0],
        [   6,  164, 1109, ...,    0,    0,    0],
        [   6,    2,   24, ...,    0,    0,    0]], dtype=int32)>,
 <tf.Tensor: shape=(15, 1, 1, 400), dtype=float32, numpy=
 array([[[[0., 0., 0., ..., 1., 1., 1.]]],
 
 
        [[[0., 0., 0., ..., 1., 1., 1.]]],
 
 
        [[[0., 0., 0., ..., 1., 1., 1.]]],
 
 
        ...,
 
 
        [[[0., 0., 0., ..., 1., 1., 1.]]],
 
 
    

In [None]:
# for epoch in range(EPOCHS):
#     start = time.time()

#     train_loss.reset_states()
  
#     for (batch, (inp, tar)) in enumerate(dataset):
#         train_step(inp, tar)
    
#         # 55k samples
#         # we display 3 batch results -- 0th, middle and last one (approx)
#         # 55k / 64 ~ 858; 858 / 2 = 429
#         if batch % 2 == 0:
#             print ('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, batch, train_loss.result()))
      
#     if (epoch + 1) % 3 == 0:
#         ckpt_save_path = ckpt_manager.save()
#         print ('Saving checkpoint for epoch {} at {}'.format(epoch+1, ckpt_save_path))
#         # Save the trained model weights
        
        
        
#         transformer.save_weights("/kaggle/working/wghts" + str(epoch + 1) + ".h5")
# #         transformer.build((None, 400))
# #         transformer.save('net', save_format='tf')
    
#     print ('Epoch {} Loss {:.4f}'.format(epoch + 1, train_loss.result()))

#     print ('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))


### Inference

#### Predicting one word at a time at the decoder and appending it to the output; then taking the complete sequence as an input to the decoder and repeating until maxlen or stop keyword appears

In [None]:
def evaluate(input_document, model):
    input_document = document_tokenizer.texts_to_sequences([input_document])
    input_document = tf.keras.preprocessing.sequence.pad_sequences(input_document, maxlen=encoder_maxlen, padding='post', truncating='post')

    encoder_input = tf.expand_dims(input_document[0], 0)

    decoder_input = [summary_tokenizer.word_index["<go>"]]
    output = tf.expand_dims(decoder_input, 0)
    
    for i in range(decoder_maxlen):
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(encoder_input, output)

        predictions, attention_weights = model(
            encoder_input, 
            output,
            False,
            enc_padding_mask,
            combined_mask,
            dec_padding_mask
        )
#         print(predictions)

        predictions = predictions[: ,-1:, :]
        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

        if predicted_id == summary_tokenizer.word_index["<stop>"]:
            return tf.squeeze(output, axis=0), attention_weights

        output = tf.concat([output, predicted_id], axis=-1)

    return tf.squeeze(output, axis=0), attention_weights


In [None]:
def summarize(input_document, model):
    # not considering attention weights for now, can be used to plot attention heatmaps in the future
    summarized = evaluate(input_document=input_document, model=model)[0].numpy()
    summarized = np.expand_dims(summarized[1:], 0)  # not printing <go> token
    return summary_tokenizer.sequences_to_texts(summarized)[0]  # since there is just one translated document

In [None]:
summarize(
    "Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man"
, transformer)

"shallow shallow assist' assist' assist' assist' assist' assist' assist' assist' assist' assist' clyne assist' clyne clyne clyne clyne clyne clyne clyne clyne clyne clyne clyne clyne clyne clyne clyne clyne clyne clyne assist' clyne clyne clyne clyne clyne clyne clyne clyne clyne clyne clyne clyne clyne clyne generation clyne generation clyne clyne clyne clyne generation clyne generation clyne generation clyne generation clyne generation clyne generation clyne generation clyne generation clyne generation clyne generation clyne clyne clyne assist' clyne clyne clyne clyne assist' clyne clyne clyne clyne clyne clyne clyne assist' clyne clyne clyne clyne clyne clyne clyne clyne generation clyne"

In [None]:
inferential_model = Transformer(
    num_layers, 
    d_model, 
    num_heads, 
    dff,
    encoder_vocab_size, 
    decoder_vocab_size, 
    pe_input=encoder_vocab_size, 
    pe_target=decoder_vocab_size,
)

In [None]:
inferential_model(inp, tar_inp, False, enc_padding_mask, combined_mask, dec_padding_mask)

(<tf.Tensor: shape=(15, 99, 1550), dtype=float32, numpy=
 array([[[-0.6856495 ,  0.4332151 ,  0.4247145 , ..., -0.22871181,
           0.18710841, -0.20452479],
         [-0.6940774 ,  0.4382908 ,  0.43065196, ..., -0.22120506,
           0.22329304, -0.19714385],
         [-0.68806714,  0.43086433,  0.4500097 , ..., -0.22121048,
           0.22112831, -0.23878534],
         ...,
         [-0.64473784,  0.36943957,  0.40676373, ..., -0.22221285,
           0.09031451, -0.24121226],
         [-0.6387686 ,  0.36591873,  0.4034636 , ..., -0.22746283,
           0.10020304, -0.24649155],
         [-0.62342316,  0.36080143,  0.40093082, ..., -0.22950836,
           0.10636483, -0.25090307]],
 
        [[-0.76378685,  0.24807967,  0.4416291 , ..., -0.12814073,
           0.09282455, -0.25342423],
         [-0.7207003 ,  0.22737911,  0.43258587, ..., -0.10667025,
           0.12182538, -0.2974558 ],
         [-0.7215494 ,  0.2282853 ,  0.44101852, ..., -0.16040826,
           0.12624247, -0.2

In [None]:
inferential_model.load_weights('/kaggle/input/model-cnn-daily/wghts18-2.h5')

In [None]:
summarize(
    "Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man, Hi How are you man"
, inferential_model)

"the the the transcript is a version of each day's cnn student news program use the transcript to help students with reading comprehension and vocabulary use the weekly newsquiz to test your knowledge of stories you saw on cnn student news"