# Packages

In [None]:
!pip install hazm

In [None]:
import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import string
import unicodedata
import re
import numpy as np
import os
import io
import time
import pickle
import pandas as pd
import hazm
import gc
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_colwidth', None)
from termcolor import colored
from itertools import chain
#from transformers import BertTokenizer, BertModel
who_am_i = 'Mitra'

# Data

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
all_data = pd.read_csv('.../ProsPoemParallelDataset_augmented.csv')


In [None]:
all_data.head(2)

# PreProcessing + Creating Inputs

In [None]:
normalizer = hazm.Normalizer(persian_numbers=False)

def process_sents(text):
    """Normalize a raw sentence and wrap it in ``<start>`` / ``<end>`` markers.

    Steps, in order:
      1. Run the module-level hazm ``normalizer`` over the text.
      2. Put a space in front of every ``.`` and ``/`` so that they become
         separate tokens.
      3. Replace every ``/`` (the delimiter between mesras) with ``<sep>``.
      4. Collapse each run of whitespace into one space and trim both ends.
      5. Add the ``<start>`` and ``<end>`` markers.

    Args:
        text: The raw sentence or verse.

    Returns:
        The processed string; tokens are separated by exactly one space.
    """
    text = normalizer.normalize(text)

    # detach '.' and '/' from the word in front of them
    text = re.sub(r'([\/\.])', r' \1', text)

    # substitute / with sep between mesras
    text = re.sub(r' *\/ *', ' <sep> ', text)

    # Collapse whitespace, then trim. The two substitutions above can leave a
    # leading space (text starting with '.' or '/') or a trailing one (text
    # ending with '/'); without the trim those turn into empty tokens for any
    # caller that splits the result on ' '.
    text = re.sub(r'\s+', ' ', text).strip()

    # add start and end tokens; an empty body is skipped so that no double
    # space appears between the two markers
    return ' '.join(part for part in ('<start>', text, '<end>') if part)


In [None]:
def tokenize(lang):
    """Fit a Keras tokenizer on ``lang`` and encode it as a padded id matrix.

    Args:
        lang: List of already pre-processed sentences.

    Returns:
        A ``(tensor, lang_tokenizer)`` pair: the post-padded integer matrix
        produced by ``pad_sequences`` and the fitted tokenizer.
    """
    # filters='' switches off the tokenizer's character filtering
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')

    # build the vocabulary from the corpus itself
    lang_tokenizer.fit_on_texts(lang)

    # words -> ids, then pad every row at the end to a common length
    sequences = lang_tokenizer.texts_to_sequences(lang)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(
        sequences, padding='post')

    return tensor, lang_tokenizer

In [None]:
def create_load_dataset(df):
    """Turn the parallel dataframe into padded id matrices and tokenizers.

    Args:
        df: Dataframe with a ``text`` column (source side) and a ``poetry``
            column (target side).

    Returns:
        ``(input_tensor, target_tensor, input_lang_tokenizer,
        target_lang_tokenizer)``.
    """
    raw_inputs = df.loc[:, 'text'].values.tolist()
    raw_targets = df.loc[:, 'poetry'].values.tolist()

    # run the shared pre-processing over both sides
    clean_inputs = list(map(process_sents, raw_inputs))
    clean_targets = list(map(process_sents, raw_targets))

    # each side gets its own vocabulary
    input_tensor, input_lang_tokenizer = tokenize(clean_inputs)
    target_tensor, target_lang_tokenizer = tokenize(clean_targets)

    return (input_tensor, target_tensor,
            input_lang_tokenizer, target_lang_tokenizer)

In [None]:
input_tensor, target_tensor,\
input_lang_tokenizer, target_lang_tokenizer = create_load_dataset(all_data)


In [None]:
max_len_input = input_tensor.shape[1]
max_len_target = target_tensor.shape[1]

print('longest sequence and the length of texts: ',
      colored(max_len_input, 'blue'))
print('longest sequence and the length of poetries: ',
      colored(max_len_target, 'blue'))

In [None]:
print('Start token:',  target_lang_tokenizer.word_index['<start>'])
print('End token:',  target_lang_tokenizer.word_index['<end>'])

# Vocabulary

In [None]:
target_lang_tokenizer.word_index['<sep>']

In [None]:
#target_lang_tokenizer.index_word[10434]

In [None]:
#target_lang_tokenizer.index_word[3]

In [None]:
#target_lang_tokenizer.index_word

In [None]:
# length of the constructed vocabularies
# (+1 accounts for the padding id 0)
vocab_len_i = len(input_lang_tokenizer.index_word) + 1
print("Plain text vocab has", colored(f"{vocab_len_i:,}", 'green'), "unique words.")

vocab_len_t = len(target_lang_tokenizer.index_word) + 1
print(f"Poetry vocab has", colored(f"{vocab_len_t:,}", 'green'), "unique words.")


In [None]:
def convert(text, poetry):
    """Print the id -> word mapping of one encoded text/poetry pair.

    Padding ids (0) are skipped.

    Args:
        text: Sequence of ids produced with ``input_lang_tokenizer``.
        poetry: Sequence of ids produced with ``target_lang_tokenizer``.
    """
    def _dump(title, ids, tokenizer):
        # one header line followed by one "id -----> word" line per real token
        print(colored(title, 'green'))
        for token_id in ids:
            if token_id == 0:
                continue
            print("%d -----> %s"%(token_id, tokenizer.index_word[token_id]))

    _dump('Text:', text, input_lang_tokenizer)
    _dump('\nPoetry:', poetry, target_lang_tokenizer)

In [None]:
print(colored('Text: ', 'blue'), all_data.loc[5, 'text'])
print(colored('Poetry: ', 'blue'), all_data.loc[5, 'poetry'])
convert(input_tensor[5], target_tensor[5])

# Create the Input

In [None]:
val_indices = pd.read_pickle('.../validation_indices_le')
train_indices = pd.read_pickle('.../train_indices_.pickle')

In [None]:
all_data.loc[val_indices]

In [None]:
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val =\
input_tensor[train_indices], input_tensor[val_indices],  target_tensor[train_indices], target_tensor[val_indices]

In [None]:
"""
# segmenting the dataset
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = \
train_test_split(pd.Series(input_tensor.tolist()),
                 pd.Series(target_tensor.tolist()),
                 test_size=0.06, shuffle = True)
"""

print('Length of train and val:', 
      colored(f"{len(input_tensor_train), len(input_tensor_val)}", 'blue'))

In [None]:
"""
with open('.../validation_indices_.pickle', 'wb') as f:
    pickle.dump(input_tensor_val.index, f)

with open('.../train_indices_.pickle', 'wb') as f:
    pickle.dump(train_.index, f)
"""

In [None]:
# Main sizes used when building the input pipeline.
# NOTE(review): within the visible part of this notebook only `batch_s` is
# read again (when batching the dataset); `steps_per_epoch`, `embedding_dim`
# and `units` are not referenced by the Transformer code - confirm whether
# they are still needed.

# number of training examples
len_data = len(input_tensor_train)
# examples per batch
batch_s = 128
# full batches per epoch (integer division drops the remainder)
steps_per_epoch = len_data // batch_s
embedding_dim = 256
units = 1024

In [None]:
np.array(input_tensor_train.tolist(), dtype='int32')

In [None]:
# Sizes of the two splits. `len_data_train` doubles as the shuffle buffer
# size below, i.e. the buffer holds the whole training set.
len_data_train = len(input_tensor_train)
# NOTE(review): `len_data_test` is not read in the visible part of the notebook.
len_data_test = len(target_tensor_val)

# Create the training dataset from (input, target) pairs cast to int32,
# shuffle it and group it into batches of `batch_s`; drop_remainder=True
# discards the final, smaller batch so every batch has the same size.

train_batches = tf.data.Dataset.from_tensor_slices((
    np.array(input_tensor_train.tolist(), dtype='int32'),
     np.array(target_tensor_train.tolist(), dtype='int32')
)).shuffle(len_data_train).batch(batch_s, drop_remainder=True)



# Positional Encoding


In [None]:
def get_angles(pos, i, d_model):
    """Return the positional-encoding angles ``pos / 10000^(2*(i//2)/d_model)``.

    Args:
        pos: Position index (scalar or array).
        i: Embedding-dimension index (scalar or array, broadcast against
            ``pos``).
        d_model: Embedding size.

    Returns:
        ``pos`` multiplied by the per-dimension angle rate.
    """
    # dimensions 2k and 2k+1 share the same exponent
    exponent = (2 * (i//2)) / np.float32(d_model)
    angle_rates = 1 / np.power(10000, exponent)
    return pos * angle_rates

In [None]:
def positional_encoding(position, d_model):
    """Build the sinusoidal positional-encoding table.

    Args:
        position: Number of positions to encode.
        d_model: Embedding size.

    Returns:
        A float32 tensor of shape ``(1, position, d_model)``: sine on the
        even columns and cosine on the odd columns.
    """
    positions = np.arange(position)[:, np.newaxis]   # column vector
    dims = np.arange(d_model)[np.newaxis, :]         # row vector
    table = get_angles(positions, dims, d_model)

    # even columns (0, 2, 4, ...) -> sine
    table[:, 0::2] = np.sin(table[:, 0::2])

    # odd columns (1, 3, 5, ...) -> cosine
    table[:, 1::2] = np.cos(table[:, 1::2])

    # add the leading axis so the table broadcasts over a batch
    return tf.cast(table[np.newaxis, ...], dtype=tf.float32)

In [None]:
list(range(10)[0::2])

In [None]:
n_position = 1024
d_model = 512
pos_encoding = positional_encoding(n_position, d_model)
print(pos_encoding.shape)
pos_encoding = pos_encoding[0]

# ----------------irrelevant-----------
# Juggle the dimensions for the plot
pos_encoding = tf.reshape(pos_encoding, (n_position, d_model//2, 2))
pos_encoding = tf.transpose(pos_encoding, (2, 1, 0))
pos_encoding = tf.reshape(pos_encoding, (d_model, n_position))

plt.pcolormesh(pos_encoding, cmap='RdBu')
plt.ylabel('Depth')
plt.xlabel('Position')
plt.colorbar()
plt.show()

# Padding

In [None]:
def create_padding_mask(seq):
    """Return a float mask that is 1.0 wherever ``seq`` holds the padding id 0.

    Two singleton axes are inserted in front of the last axis, so a
    ``(batch, seq_len)`` input yields a ``(batch, 1, 1, seq_len)`` mask that
    broadcasts against the attention logits.
    """
    is_padding = tf.math.equal(seq, 0)
    mask = tf.cast(is_padding, tf.float32)
    return mask[:, tf.newaxis, tf.newaxis, :]

In [None]:
x = tf.constant([[7, 6, 0, 0, 1], 
                 [1, 2, 3, 0, 0], 
                 [0, 0, 0, 4, 5]])



In [None]:
tf.cast(tf.math.equal(x, 0), tf.float32)

In [None]:
create_padding_mask(x)

In [None]:
def create_look_ahead_mask(size):
    """Return a ``(size, size)`` mask hiding future positions (1 = masked).

    ``band_part(..., -1, 0)`` keeps the lower triangle including the
    diagonal; subtracting it from 1 leaves ones strictly above the diagonal,
    i.e. on the positions that come after the current one.
    """
    visible = tf.linalg.band_part(tf.ones((size, size)), -1, 0 )
    return 1 - visible  # (seq_len, seq_len)

In [None]:
x = tf.random.uniform((1,3))
x

In [None]:
x.shape[1]

In [None]:
create_look_ahead_mask(x.shape[1])

# Scaled Dot Product

In [None]:
def scaled_dot_product_attention(q, k, v, mask):
    """Compute ``softmax(q @ k^T / sqrt(d_k)) @ v``.

    Args:
        q: Queries, shape ``(..., seq_len_q, depth)``.
        k: Keys, shape ``(..., seq_len_k, depth)``.
        v: Values, shape ``(..., seq_len_k, depth_v)``.
        mask: Tensor broadcastable to the logits with 1 on positions to
            hide, or ``None`` for no masking.

    Returns:
        ``(output, attention_weights)`` where ``output`` has shape
        ``(..., seq_len_q, depth_v)``.
    """
    scores = tf.matmul(q, k, transpose_b=True)

    # scale by the square root of the key depth
    key_depth = tf.cast(tf.shape(k)[-1], tf.float32)
    logits = scores / tf.math.sqrt(key_depth)

    # masked positions receive a large negative logit, so the softmax
    # assigns them (almost) zero weight
    if mask is not None:
        logits += (mask * -1e9)

    # normalise over the key axis: each query's weights sum to 1
    attention_weights = tf.nn.softmax(logits, axis=-1)

    return tf.matmul(attention_weights, v), attention_weights

In [None]:
def print_att(q, k, v):
    """Run unmasked scaled dot-product attention and print both results."""
    result = scaled_dot_product_attention(q, k, v, None)
    print('Attention weights: ', result[1])
    print('Output is:', result[0])


In [None]:
np.set_printoptions(suppress=True)

k = tf.constant([[10, 0, 0],
                [0, 10, 0],
                [0, 0, 10],
                [0, 0, 10]], dtype=tf.float32)  # (4, 3)


v = tf.constant([[1, 0],
                [10, 0],
                [100, 5],
                [1000, 6]], dtype=tf.float32)  # (4, 2)

q = tf.constant([[0, 10, 0]], dtype=tf.float32)  # (1, 3)
print_att(q, k, v)

In [None]:
# This query aligns with a repeated key (third and fourth),
# so all associated values get averaged.
temp_q = tf.constant([[0, 0, 10]], dtype=tf.float32)  # (1, 3)
print_att(temp_q, k, v)

# Multihead

In [None]:
class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head attention built on ``scaled_dot_product_attention``.

    ``d_model`` (the embedding size) is split evenly across ``num_heads``
    heads, each of size ``depth = d_model // num_heads``.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        # every head must receive the same number of features
        assert d_model % self.num_heads == 0

        self.depth = d_model // self.num_heads

        # linear projections for queries, keys and values
        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        # output projection applied after the heads are merged again
        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        """Reshape ``(batch, seq_len, d_model)`` to ``(batch, heads, seq_len, depth)``."""
        per_head = tf.reshape(
            x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask):
        """Attend from ``q`` to ``k``/``v``. Note the argument order: v, k, q.

        Returns:
            ``(output, attention_weights)``; ``output`` has shape
            ``(batch, seq_len_q, d_model)``.
        """
        batch_size = tf.shape(q)[0]

        # project, then split into heads:
        # (batch, seq_len, d_model) -> (batch, num_heads, seq_len, depth)
        q = self.split_heads(self.wq(q), batch_size)
        k = self.split_heads(self.wk(k), batch_size)
        v = self.split_heads(self.wv(v), batch_size)

        scaled_attention, attention_weights = scaled_dot_product_attention(
            q, k, v, mask)

        # undo the head split: (batch, seq_len_q, num_heads, depth)
        merged = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])
        # ... and concatenate the heads: (batch, seq_len_q, d_model)
        merged = tf.reshape(merged, (batch_size, -1, self.d_model))

        return self.dense(merged), attention_weights

In [None]:
temp_mha = MultiHeadAttention(d_model=512, num_heads=8)

y = tf.random.uniform((1, 60, 512)) # (batch_size, encoder_sequence, d_model)
out, attn = temp_mha(y, k=y, q=y, mask=None)
print('output shape: ', out.shape)
print('attention weights shape: ', attn.shape)

In [None]:
def point_wise_feed_forward_network(d_model, d_ff):
    """Return the position-wise feed-forward block: Dense(d_ff, ReLU) -> Dense(d_model).

    Applied to a ``(batch_size, seq_len, d_model)`` input it yields the same
    shape, passing through ``(batch_size, seq_len, d_ff)`` in between.
    """
    expand = tf.keras.layers.Dense(d_ff, activation='relu')
    project = tf.keras.layers.Dense(d_model)
    return tf.keras.Sequential([expand, project])
    

In [None]:
# you may want to change this---------------------------
# 2048 to 1024
sample_ffn = point_wise_feed_forward_network(512, 2048)
# (batch_size, seq_len, d_model)
sample_ffn(tf.random.uniform((64, 50, 512))).shape

# Encoder Layer

In [None]:
# The input passes through N stacked encoder layers.
# Each decoder layer attends to its own input and to the encoder's output.

class EncoderLayer(tf.keras.layers.Layer):
    """One encoder block: self-attention and feed-forward sub-layers.

    Each sub-layer is followed by dropout, a residual connection and layer
    normalisation. Input and output both have shape
    ``(batch_size, input_seq_len, d_model)``.
    """

    def __init__(self, d_model, num_heads,
                 d_ff, rate=0.1):
        super().__init__()

        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ff_net = point_wise_feed_forward_network(d_model, d_ff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        # --- self-attention sub-layer ---
        attended, _ = self.mha(x, x, x, mask)
        attended = self.dropout1(attended, training=training)
        normed_attention = self.layernorm1(x + attended)

        # --- feed-forward sub-layer ---
        transformed = self.ff_net(normed_attention)
        transformed = self.dropout2(transformed, training=training)

        return self.layernorm2(normed_attention + transformed)

In [None]:
sample_encoder_layer = EncoderLayer(d_model=512,
                                    num_heads=8,
                                    d_ff=2048)

sample_encoder_layer_o = sample_encoder_layer(
    tf.random.uniform((64, 43, 512)),
    training = False,
    mask = None
)

# shape : (batch_size, input_seq_len, d_model)
print('output of encoder layer: shape: ', sample_encoder_layer_o.shape)

# Decoder Layer

In [None]:
class DecoderLayer(tf.keras.layers.Layer):
    """One decoder block with three sub-layers.

    1. Masked self-attention over the target sequence (``mha_1``).
    2. Encoder-decoder attention: queries come from the decoder, keys and
       values from ``encoder_output`` (``mha_2``).
    3. Position-wise feed-forward network.

    Each sub-layer is followed by dropout, a residual connection and layer
    normalisation.
    """

    def __init__(self, d_model, num_heads,
                 d_ff, rate=0.1):
        super().__init__()

        self.mha_1 = MultiHeadAttention(d_model, num_heads)
        self.mha_2 = MultiHeadAttention(d_model, num_heads)

        self.ff_net = point_wise_feed_forward_network(d_model, d_ff)

        self.layernorm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm_3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout_1 = tf.keras.layers.Dropout(rate)
        self.dropout_2 = tf.keras.layers.Dropout(rate)
        self.dropout_3 = tf.keras.layers.Dropout(rate)

    def call(self, x, encoder_output, training,
            look_ahead_mask, padding_mask):
        """Run the block.

        Args:
            x: Decoder input, ``(batch_size, target_seq_len, d_model)``.
            encoder_output: ``(batch_size, input_seq_len, d_model)``.
            training: Whether dropout is active.
            look_ahead_mask: Mask for the self-attention sub-layer.
            padding_mask: Mask for the encoder-decoder attention sub-layer.

        Returns:
            ``(output, self_attention_weights, cross_attention_weights)``;
            ``output`` has shape ``(batch_size, target_seq_len, d_model)``.
        """
        # 1) masked self-attention
        self_attn, attention_w_block_1 = self.mha_1(x, x, x, look_ahead_mask)
        self_attn = self.dropout_1(self_attn, training=training)
        output_1 = self.layernorm_1(self_attn + x)

        # 2) encoder-decoder attention (v and k from the encoder, q from
        #    the decoder)
        cross_attn, attention_w_blocks_2 = self.mha_2(
            encoder_output, encoder_output, output_1, padding_mask
        )
        cross_attn = self.dropout_2(cross_attn, training=training)
        output_2 = self.layernorm_2(cross_attn + output_1)

        # 3) feed-forward network
        ff_output = self.dropout_3(self.ff_net(output_2), training=training)
        output_3 = self.layernorm_3(ff_output + output_2)

        return output_3, attention_w_block_1, attention_w_blocks_2



In [None]:
sample_decoder_layer = DecoderLayer(d_model=512,
                                    num_heads=8, 
                                    d_ff=2048)

sample_decoder_layer_output, _, _ = sample_decoder_layer(
    tf.random.uniform((64, 50, 512)),
    sample_encoder_layer_o,
    training=False,
    look_ahead_mask=None,
    padding_mask=None
)

 # (batch_size, target_seq_len, d_model)
print('Output shape of our decoder: ', sample_decoder_layer_output.shape)

# Encoder

In [None]:
# input embeddings
# positional encoding
# add them up
# n encoder layers

In [None]:
class Encoder(tf.keras.layers.Layer):
    """Encoder stack: embedding + positional encoding + ``num_layers`` EncoderLayers."""

    def __init__(self, num_layers, d_model, num_heads,
                 d_ff, input_vocab_size,
                 maximum_position_encoding,
                 rate = 0.1):
        super().__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embeddings = tf.keras.layers.Embedding(input_vocab_size,
                                                    d_model)
        # pre-computed table, sliced to the actual length in call()
        self.pos_encoding = positional_encoding(maximum_position_encoding,
                                                self.d_model)
        self.encoder_layers = [
            EncoderLayer(d_model, num_heads, d_ff, rate)
            for _ in range(num_layers)
        ]

        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        """Encode token ids ``(batch_s, input_seq_len)`` into ``(batch_s, input_seq_len, d_model)``."""
        seq_len = tf.shape(x)[1]

        # embed, scale by sqrt(d_model), then add the positional signal
        scale = tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x = self.embeddings(x) * scale
        x = x + self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for layer in self.encoder_layers:
            x = layer(x, training, mask)

        return x




In [None]:
sample_encoder = Encoder(num_layers=2, d_model=512, 
                         num_heads=8, d_ff=2048,
                         input_vocab_size=8500,
                         maximum_position_encoding=10000)

temp_input = tf.random.uniform((64, 62),
                               dtype=tf.int64,
                               minval=0, maxval=200
                               )
sample_encoder_output = sample_encoder(temp_input,
                                       training=False,
                                       mask=None)
print('Sample encoder output shape: ', sample_encoder_output.shape)
# (batch_s, input_seq_len, d_model)

# Decoder

In [None]:
# output embedding
# positional encoding
# add them up
# n * decoder layers

In [None]:
class Decoder(tf.keras.layers.Layer):
    """Decoder stack: embedding + positional encoding + ``num_layers`` DecoderLayers."""

    def __init__(self, num_layers, d_model,
                 num_heads, d_ff, target_vocab_size,
                 maximum_position_encoding,
                 rate=0.1):
        super().__init__()
        self.d_model = d_model
        self.num_layers = num_layers

        self.embeddings = tf.keras.layers.Embedding(target_vocab_size, d_model)
        # pre-computed table, sliced to the actual length in call()
        self.pos_encoding = positional_encoding(maximum_position_encoding,
                                                d_model)

        self.decoder_layers = [
            DecoderLayer(d_model, num_heads, d_ff, rate)
            for _ in range(num_layers)
        ]

        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, encoder_output, training,
             look_ahead_mask, padding_mask):
        """Decode target ids against the encoder output.

        Returns:
            ``(x, attention_weights)``: ``x`` has shape
            ``(batch_size, target_seq_len, d_model)``; ``attention_weights``
            maps ``decoder_layer{n}_block_1`` / ``decoder_layer{n}_block_2``
            (n starting at 1) to the attention weights of each layer.
        """
        seq_len = tf.shape(x)[1]

        # embed, scale by sqrt(d_model), then add the positional signal
        scale = tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x = self.embeddings(x) * scale
        x = x + self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        attention_weights = {}
        for layer_no, layer in enumerate(self.decoder_layers, start=1):
            x, block_1, block_2 = layer(x,
                                        encoder_output,
                                        training,
                                        look_ahead_mask,
                                        padding_mask)
            attention_weights[f'decoder_layer{layer_no}_block_1'] = block_1
            attention_weights[f'decoder_layer{layer_no}_block_2'] = block_2

        return x, attention_weights

In [None]:
sample_decoder = Decoder(num_layers=2,
                         d_model=512,
                         num_heads=8,
                         d_ff=2048,
                         target_vocab_size=80000,
                         maximum_position_encoding=5000)

temp_input = tf.random.uniform((64, 26),
                               dtype=tf.int64,
                               minval=0, maxval=200)

output, att = sample_decoder(temp_input,
                             encoder_output=sample_encoder_output,
                             training=False,
                             look_ahead_mask=None,
                             padding_mask=None)

print('Shape of the Decoder output: ', output.shape)

print('Shape of the attention output: ', att['decoder_layer2_block_2'].shape)

# Transformer

In [None]:
class Transformer(tf.keras.Model):
    """Encoder-decoder Transformer ending in a linear projection to the target vocabulary.

    The final layer has no softmax, so the model returns logits.
    """

    def __init__(self, num_layers,
                 d_model, num_heads,
                 d_ff, input_vocab_size,
                 target_vocab_size,
                 pe_input, pe_target,
                 rate=0.1):
        super().__init__()

        # NOTE: despite its name this attribute holds the encoder stack; the
        # name is kept unchanged because it is part of the model's attributes.
        self.tokenizer = Encoder(num_layers, d_model,
                                 num_heads, d_ff,
                                 input_vocab_size,
                                 pe_input,
                                 rate)
        self.decoder = Decoder(num_layers, d_model, num_heads,
                               d_ff, target_vocab_size, pe_target,
                               rate)
        # projects decoder states to one logit per target-vocabulary entry
        self.final_layer = tf.keras.layers.Dense(target_vocab_size)

    def call(self, input_, target, training,
             encoder_padding_mask,
             look_ahead_mask, decoder_padding_mask):
        """Run a full forward pass.

        Returns:
            ``(final_output, attention_weights)``; ``final_output`` has
            shape ``(batch_size, seq_len, vocab_size)``.
        """
        # (batch_s, inp_seq_len, d_model)
        memory = self.tokenizer(input_, training, encoder_padding_mask)

        decoded, attention_weights = self.decoder(
            target, memory, training, look_ahead_mask,
            decoder_padding_mask
        )

        return self.final_layer(decoded), attention_weights

In [None]:
sample_transformer = Transformer(
    num_layers=2, d_model=512, num_heads=8,
    d_ff=2048, input_vocab_size=8500,
     target_vocab_size=8000,
    pe_input=1000, pe_target=6000
)

temp_input = tf.random.uniform((64, 38), dtype=tf.int64, minval=0, maxval=200)
temp_target = tf.random.uniform((64, 36), dtype=tf.int64, minval=0, maxval=200)


In [None]:
fn_out, _ = sample_transformer(temp_input,
                               temp_target, 
                               training=False,
                               encoder_padding_mask=None,
                               look_ahead_mask=None,
                               decoder_padding_mask=None)

In [None]:
fn_out.shape

In [None]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Warm-up learning-rate schedule.

    ``lr(step) = d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)``:
    the rate grows linearly for the first ``warmup_steps`` steps and then
    decays with the inverse square root of the step.

    Args:
        d_model: Model (embedding) size.
        warmup_steps: Number of warm-up steps.
    """

    def __init__(self, d_model, warmup_steps=4000):

        super(CustomSchedule, self).__init__()

        self.d_model = d_model
        # kept as a float tensor because it is fed to tf.math.rsqrt below
        self.d_model = tf.cast(self.d_model, tf.float32)

        self.warmup_steps = warmup_steps

    def __call__(self, step):
        # tf.math.rsqrt only accepts floating-point input, so an integer
        # step (for example an integer iteration counter) must be cast
        # first; for a float32 step the cast is a no-op.
        step = tf.cast(step, tf.float32)

        arg_1 = tf.math.rsqrt(step)
        arg_2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg_1, arg_2)

In [None]:
learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)

In [None]:
temp_learning_rate_schedule = CustomSchedule(d_model)

plt.plot(temp_learning_rate_schedule(tf.range(40000, dtype=tf.float32)))
plt.ylabel("Learning Rate")
plt.xlabel("Train Step")

# Loss

In [None]:
# Per-token cross-entropy on integer labels.
# from_logits=True: the Transformer's final Dense layer has no softmax.
# reduction='none': keep one loss value per token so that loss_function can
# zero out the padding positions before averaging.
loss_obj = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,
    reduction='none'
)

In [None]:
def loss_function(real, pred):
    """Mean cross-entropy over the non-padding target tokens.

    Args:
        real: Target ids; id 0 marks padding.
        pred: Logits produced by the model.

    Returns:
        Sum of the per-token losses on real tokens divided by the number of
        real tokens.
    """
    per_token = loss_obj(real, pred)

    # 1.0 on real tokens, 0.0 on padding
    keep = tf.cast(tf.math.not_equal(real, 0), dtype=per_token.dtype)
    per_token *= keep

    return tf.reduce_sum(per_token)/tf.reduce_sum(keep)

In [None]:
def accuracy_function(real, pred):
    """Fraction of non-padding target tokens whose arg-max prediction is correct.

    Args:
        real: Target ids; id 0 marks padding.
        pred: Logits with the vocabulary on axis 2.
    """
    predicted_ids = tf.argmax(pred, axis=2, output_type=tf.int32)

    is_token = tf.math.not_equal(real, 0)
    # a hit only counts on real (non-padding) positions
    hits = tf.math.logical_and(is_token, tf.equal(real, predicted_ids))

    hit_count = tf.reduce_sum(tf.cast(hits, dtype=tf.float32))
    token_count = tf.reduce_sum(tf.cast(is_token, dtype=tf.float32))
    return hit_count/token_count

In [None]:

train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.Mean(name='train_accuracy')


In [None]:
num_layers = 4
d_model = 128
d_ff = 512
num_heads = 8
dropout_rate = 0.1

In [None]:
transformer = Transformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    d_ff=d_ff,
    input_vocab_size=len(input_lang_tokenizer.word_index)+1,
    target_vocab_size=len(target_lang_tokenizer.word_index)+1,
    pe_input=1000,
    pe_target=1000,
    rate=dropout_rate)

In [None]:
def create_masks(input, target):
    """Build the three attention masks used by the Transformer.

    Args:
        input: Encoder token ids.
        target: Decoder token ids.

    Returns:
        ``(encoder_padding_mask, combined_mask, decoder_padding_mask)``:
        the padding mask of ``input`` for the encoder, the element-wise
        maximum of the target's padding and look-ahead masks for the
        decoder's self-attention, and the padding mask of ``input`` again
        for the decoder's attention over the encoder output.
    """
    # both are derived from the encoder input: one for the encoder itself,
    # one for the decoder's second attention block
    encoder_padding_mask = create_padding_mask(input)
    decoder_padding_mask = create_padding_mask(input)

    # hide future positions and padded target positions at the same time
    target_len = tf.shape(target)[1]
    combined_mask = tf.maximum(create_padding_mask(target),
                               create_look_ahead_mask(target_len))

    return encoder_padding_mask, combined_mask, decoder_padding_mask


In [None]:
"""
# irrelevant-------------------------------------
train_step_signature = [
                        tf.TensorSpec(shape=(None, None), dtype=tf.int64),
                        tf.TensorSpec(shape=(None,None), dtype=tf.int64)
]"""

In [None]:
@tf.function#(input_signature=train_step_signature)
def train_step(input, target):
    """Run one optimisation step with teacher forcing.

    The decoder is fed ``target`` without its last token and is trained to
    predict ``target`` without its first token. The running ``train_loss``
    and ``train_accuracy`` metrics are updated as a side effect.

    Args:
        input: Batch of encoder token ids.
        target: Batch of decoder token ids, including start/end markers.
    """
    target_input = target[:, :-1]   # what the decoder sees
    target_real = target[:, 1:]     # what it has to predict

    # The masks describe the decoder *input*, so they are built from
    # target_input (as evaluate() does with its decoder input), not from the
    # shifted labels.
    encoder_padding_mask, combined_mask, decoder_padding_mask = create_masks(
        input, target_input)

    with tf.GradientTape() as tape:
        predictions, _ = transformer(input,
                                     target_input,
                                     True,
                                     encoder_padding_mask,
                                     combined_mask,
                                     decoder_padding_mask)

        loss = loss_function(target_real, predictions)

    gradients = tape.gradient(loss,
                              transformer.trainable_variables)

    optimizer.apply_gradients(zip(gradients,
                                  transformer.trainable_variables))

    train_loss(loss)
    train_accuracy(accuracy_function(target_real, predictions))
In [None]:
gc.collect()

In [None]:
epochs = 13

# augmented n =10,000
# n =1 
for epoch in range(epochs):
    print(colored(f"Epoch {epoch+1}", 'green'))
    start = time.time()

    # the metrics are running means, so restart them for every epoch
    train_loss.reset_states()
    train_accuracy.reset_states()

    for (batch, (input, target)) in enumerate(train_batches):
        train_step(input, target)

        # progress report every 50 batches
        if batch % 50 == 0:
            print(colored(f'Batch {batch}', 'blue'))
            print(f"Loss {train_loss.result():.4f}\nAccuracy {train_accuracy.result():.4f}")

    # end-of-epoch summary; this is where the `start` timestamp is used
    print(f"Epoch {epoch+1} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}")
    print(f"Time taken for 1 epoch: {time.time() - start:.2f} secs\n")




# Evaluation

In [None]:
#tf.compat.v1.disable_eager_execution()

In [None]:
def evaluate(sent, max_len=40):
    """Greedily decode a poem for one plain-text sentence.

    The sentence is pre-processed with ``process_sents``, encoded with
    ``input_lang_tokenizer`` and padded to ``max_len_input``. The decoder
    then starts from ``<start>`` and repeatedly appends the arg-max token
    until ``<end>`` is produced or ``max_len`` tokens have been generated.

    Words unknown to the input tokenizer are skipped (the behaviour of
    ``Tokenizer.texts_to_sequences``) instead of raising ``KeyError``.

    Args:
        sent: Raw input sentence.
        max_len: Maximum number of tokens to generate.

    Returns:
        ``(output_txt, tokens, attention_ws)``:
        ``output_txt`` is the decoded text, every token (including
        ``<start>`` and, if generated, ``<end>``) followed by one space;
        ``tokens`` is ``output_txt.split(' ')`` (so its last element is an
        empty string); ``attention_ws`` holds the attention weights of the
        last decoding step, or ``None`` when no step was run.
    """
    sentence = process_sents(sent)

    # Encode with the tokenizer itself so that inference uses exactly the
    # same splitting / lower-casing as training and tolerates unknown words.
    inputs = input_lang_tokenizer.texts_to_sequences([sentence])
    inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs,
                                                          maxlen=max_len_input,
                                                          padding='post')
    inputs = tf.convert_to_tensor(inputs, dtype=tf.int32)

    end_id = target_lang_tokenizer.word_index['<end>']
    output = tf.expand_dims([target_lang_tokenizer.word_index['<start>']],
                            0)

    # stays None only if the loop below never runs (max_len <= 0)
    attention_ws = None
    for _ in range(max_len):
        encoder_padding_mask, combined_mask, decoder_padding_mask = \
            create_masks(inputs, output)

        # predictions: (batch_s, seq_tar_len, vocab size)
        predictions, attention_ws = transformer(inputs,
                                                output,
                                                False,
                                                encoder_padding_mask,
                                                combined_mask,
                                                decoder_padding_mask)

        # only the last position matters: (batch_s, 1, vocab)
        predictions = predictions[:, -1:, :]

        predicted_id = tf.argmax(predictions,
                                 axis=-1,
                                 output_type=tf.int32)

        output = tf.concat([output, predicted_id],
                           axis=-1)

        if int(predicted_id.numpy()[0, 0]) == end_id:
            break

    # Map ids back to words. Ids without a word (the padding id 0, which the
    # output layer is able to predict) are skipped instead of raising.
    index_word = target_lang_tokenizer.index_word
    words = [index_word[token_id]
             for token_id in output[0].numpy().tolist()
             if token_id in index_word]
    output_txt = ''.join(word + ' ' for word in words)

    return output_txt, output_txt.split(' '), attention_ws


In [None]:
def print_poetry(sent):
    """Generate a poem for ``sent`` and print the input next to the result."""
    poetry, _tokens, _att_weights = evaluate(sent)
    for label, value in (('Text: ', sent), ('Poetry: ', poetry)):
        print(colored(label, 'green'), value)
    

In [None]:
print_poetry('با این توصیف عشاق بی عقل و بدون هدف خاص زندگی می کنند و دارای هیچ هدف و مغزی نیستند تا اینکه به جهنم می رسند و به هیچ جایگاه دنیوی و واقعی دست پیدا نمی کنند')

In [None]:
print_poetry('با این توصیف عشاق بی عقل و بدون هدف خاص زندگی می کنند و دارای هیچ هدف و مغزی نیستند تا اینکه به جهنم می رسند و به هیچ جایگاه دنیوی و واقعی دست پیدا نمی کنند') # 10

In [None]:
print_poetry('چه خوش است دردی که یار را برای عیادت بر سر بالینم آور')

In [None]:
print_poetry('چه خوش است دردی که یار را برای عیادت بر سر بالینم آور')

In [None]:
print_poetry('تحمل مشکلات و سختی ها در این دنیا شما را به مقام والا و عشق و هدف حقیقی در پیش معشوق می رساند.') # 12 ایپاک 

In [None]:
print_poetry('تحمل مشکلات و سختی ها در این دنیا شما را به مقام والا و عشق حقیقی در پیش معشوق می رساند.') # 12 ایپاک 

In [None]:
print_poetry('چه خوش است دردی که یار را برای عیادت بر سر بالینم آور')

In [None]:
print_poetry('ای مردم اینک دور دور عیسی است اسرار دین او را با دل و جان گوش دهید.')

In [None]:
print_poetry('با این توصیف عشاق بی عقل و بدون هدف خاص زندگی می کنند و دارای هیچ هدف و مغزی نیستند تا اینکه به جهنم می رسند و به هیچ جایگاه دنیوی و واقعی دست پیدا نمی کنند')

# Evaluate and output a dataset

In [None]:
len(all_data)

In [None]:
def clean(t):
    """Tidy the spacing and punctuation of one dataset string.

    Steps, applied in order:
      1. drop a single leading and a single trailing space;
      2. surround every ``/`` with exactly one space on each side;
      3. collapse `` . .`` into a single ``.``;
      4. collapse a run of spaces followed by a whitespace character
         into one space;
      5. attach a final `` .`` to the preceding word;
      6. remove a ``.`` (with surrounding spaces) at the start of the string;
      7. delete Persian digits.

    Args:
        t: The string to clean.

    Returns:
        The cleaned string.
    """
    t = re.sub(r'^ ', '', t)
    t = re.sub(r' $', '', t)
    t = re.sub(r' */ *', ' / ', t)

    # The replacement has to be a plain '.': in a replacement template
    # re.sub leaves an unknown escape such as '\.' untouched, which would
    # write a literal backslash into the result ('abc\.').
    t = re.sub(r' \. \.', '.', t)
    t = re.sub(r' +\s', ' ', t)

    t = re.sub(r' \.$', '.', t)
    t = re.sub(r'^ *\. *', '', t)

    t = re.sub(r'[۱۲۳۴۵۶۷۸۹۰]', '', t)

    return t

In [None]:
# Run the same clean-up over both columns of the parallel corpus.
all_data.loc[:, 'poetry'] = all_data['poetry'].apply(clean)
all_data.loc[:, 'text'] = all_data['text'].apply(clean)

In [None]:
print(all_data.loc[0, 'poetry'])

In [None]:
all_data.loc[input_tensor_val.index]#.isna().sum()

In [None]:
def evaluate_dataset(df, max_len=40):
    """Greedy-decode poetry for every row of ``df`` and attach it as a column.

    Each ``text`` value is preprocessed with ``process_sents``, mapped to ids
    with ``input_lang_tokenizer`` and decoded one token at a time with
    ``transformer`` until ``<end>`` is predicted or ``max_len`` tokens have
    been produced. A row that fails for any reason (for example a word that
    is missing from the input vocabulary raises ``KeyError``) is reported on
    stdout and gets ``None`` as its generated text, so one bad row does not
    abort the whole run.

    Args:
        df: DataFrame with a ``text`` column. The result's columns are
            renamed positionally to ``poetry_ground_truth``, ``text``,
            ``poetry_generated_MHA``, so ``df`` must have exactly two
            columns in that order.
        max_len: Maximum number of decoding steps per row.

    Returns:
        A tuple ``(df_output, generated_text)`` where ``df_output`` is ``df``
        (index reset) with the generated column appended and
        ``generated_text`` is the list of generated strings (``None`` for
        failed rows). Every generated token is followed by one space.
    """
    generated_text = []
    df = df.reset_index(drop=True)

    for r in df.index:
        try:
            # Same preprocessing as used for training sentences.
            sentence = process_sents(df.loc[r, 'text'])

            # Unknown words raise KeyError here and are handled below.
            inputs = [input_lang_tokenizer.word_index[word]
                      for word in sentence.split(' ')]
            inputs = tf.keras.preprocessing.sequence.pad_sequences(
                [inputs],
                maxlen=max_len_input,
                padding='post')
            inputs = tf.convert_to_tensor(inputs, dtype=tf.int32)

            start_id = target_lang_tokenizer.word_index['<start>']
            end_id = target_lang_tokenizer.word_index['<end>']

            # The decoder input starts as a single <start> token.
            output = tf.expand_dims([start_id], 0)

            for _ in range(max_len):
                encoder_padding_mask, combined_mask, decoder_padding_mask = \
                    create_masks(inputs, output)

                # shape pred : (batch_s, seq_tar_len, vocab size)
                predictions, _attention_ws = transformer(inputs,
                                                         output,
                                                         False,
                                                         encoder_padding_mask,
                                                         combined_mask,
                                                         decoder_padding_mask)

                # Keep only the last position: (batch_s, 1, vocab)
                predictions = predictions[:, -1:, :]
                predicted_id = tf.argmax(predictions,
                                         axis=-1,
                                         output_type=tf.int32)

                # Feed the prediction back in as the next decoder input.
                output = tf.concat([output, predicted_id], axis=-1)

                if predicted_id == end_id:
                    break

            output_txt = ''.join(
                target_lang_tokenizer.index_word[token_id] + ' '
                for token_id in output[0].numpy())

            generated_text.append(output_txt)

        except Exception as err:
            # Best-effort: report the failing row and keep going.
            # KeyboardInterrupt/SystemExit are deliberately not caught so a
            # long run can still be stopped.
            print(r)
            print(df.loc[r, 'text'])
            print(f'{type(err).__name__}: {err}')

            generated_text.append(None)

    df_output = pd.concat([df, pd.Series(generated_text)],
                          axis=1)

    # Positional rename: relies on df having exactly two columns.
    df_output.columns = ['poetry_ground_truth',
                         'text',
                         'poetry_generated_MHA']

    return df_output, generated_text


In [None]:
# Generate poetry for the first 100 rows of all_data selected by val_indices,
# then display the resulting table as the cell output.
output_df, generated_text = evaluate_dataset(all_data.loc[val_indices][:100], max_len=40)
output_df

In [None]:
# Write the comparison table to disk without the row index.
output_df.to_csv(
    '.../Results/MultiHeadAttention_Poetry_19_withAugmented_P&T.csv',
    index=False,
)

# *Experiment*

In [None]:
# Keep only the rows whose `text` is longer than 70 characters.
# NOTE(review): no DataFrame named `output` is created in the surrounding
# cells (they use `output_df`) — confirm which name is intended here.
output = output[[len(i)>70 for i in output.text]]

In [None]:
output_df[[len(i)>80 for i in output_df.text]]

In [None]:
len('توبه باید با اب و محبت همراه باشد')

In [None]:
all_data.poetry[0]

In [None]:
# For every row whose source text is shorter than 50 characters, print the
# row number, the source text, the generated poetry and the ground truth.
for i in range(len(output_df)):
    if len(output_df.text[i]) >= 50:
        continue

    print()
    print(colored(i, 'blue'))
    print(output_df.text[i])
    print(colored('generated: ', 'green'), output_df.poetry_generated_MHA[i])
    print(colored('gt: ', 'green'), output_df.poetry_ground_truth[i])

In [None]:
eval_ = all_data.loc[target_tensor_val.index].reset_index(drop=True)

In [None]:
print(eval_.loc[32, 'text'])

In [None]:
output_df.to_csv('.../Results/MultiHeadAttention_8_20Epochs_Poetry_17_cleaned_data.csv',
                 index=False)

# run again

In [None]:
# Print each ground-truth poem followed by the generated one.
for i in range(len(output_df)):
    for label, column in (('\nground truth: ', 'poetry_ground_truth'),
                          ('generated: ', 'poetry_generated_MHA')):
        print(label, output_df.loc[i, column])

In [None]:
# Generate poetry for every all_data row whose label is in
# target_tensor_val.index, then display the resulting table.
output_df, generated_text = evaluate_dataset(all_data.loc[target_tensor_val.index], max_len=40)
output_df

In [None]:
for record in all_data.loc[target_tensor_val.index].iterrows():
    print(record[0])

In [None]:
# Relabel the three columns of output_df by position.
# NOTE(review): evaluate_dataset names the first column 'poetry_ground_truth'
# and the third 'poetry_generated_MHA'; this cell relabels the first column
# as 'poetry_generated_MHA' — confirm that is intended before saving.
output_df.columns = ['poetry_generated_MHA', # modified this
                    'text',
                    'poetry_generated']

In [None]:
output_df.to_csv('.../Results/MultiHeadAttention.csv',
                 index=False)

In [None]:
output_df.to_csv('.../Results/MultiHeadAttention_Poetry_17.csv',
                 index=False)

In [None]:
# Turn each id sequence in input_tensor_val back into a space-separated
# string (each word is followed by one space) and collect the results.
sentences = []

for i in input_tensor_val:
    output = ''
    for j in i:
        # Stop at the first 0 id — presumably the post-padding value added by
        # pad_sequences; confirm against how input_tensor_val was built.
        if j ==0:
            break

        output = output + input_lang_tokenizer.index_word[j] + ' '

    sentences.append(output)