# Packages

In [None]:
!pip install hazm

In [None]:
import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import gc
import string
import unicodedata
import re
import numpy as np
import os
import io
import time
import pandas as pd
import hazm
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_colwidth', None)
from termcolor import colored
from itertools import chain
#from transformers import BertTokenizer, BertModel
who_am_i = 'Mitra'

# Data

In [None]:
# Mount Google Drive at /content/drive so the files read below are
# reachable (relies on the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the augmented parallel prose/poem dataset into a DataFrame.
all_data = pd.read_csv('.../ProsPoemParallelDataset_augmented.csv')

# Report how many rows were loaded.
print('length of augmented cleaned data: ', 
      colored(len(all_data), 'blue'))

In [None]:
# Load the precomputed validation / train row indices.
# NOTE(review): read_pickle unpickles arbitrary objects; only load these
# files from a trusted location.
val_indices = pd.read_pickle('.../validation_indices.pickle')
train_indices = pd.read_pickle('.../train_indices.pickle')

In [None]:
def clean(t):
    """Tidy spacing, slashes, dots and digits of one prose/poem string.

    Args:
        t: raw text of a single dataset cell.

    Returns:
        The cleaned string: one leading/trailing space removed, every ``/``
        surrounded by exactly one space, backslashes dropped, stray dots
        merged, and Persian digits removed.
    """
    # drop a single leading / trailing space
    t = re.sub(r'^ ', '', t)
    t = re.sub(r' $', '', t)
    # exactly one space on each side of the hemistich separator
    t = re.sub(r' */ *', ' / ', t)
    t = t.replace('\\', '')
    # " . ." -> "."  (the replacement is a plain dot: an escaped dot in a
    # replacement string is emitted literally, backslash included)
    t = re.sub(r' \. *\.', '.', t)
    # collapse a run of spaces followed by whitespace into one space
    t = re.sub(r' +\s', ' ', t)

    # attach a final dot to the preceding word
    t = re.sub(r' \.$', '.', t)
    # a dot at the very beginning carries no information
    t = re.sub(r'^ *\. *', '', t)

    # remove Persian digits
    t = re.sub('[۱۲۳۴۵۶۷۸۹۰]', '', t)

    return t

# Clean both sides of the parallel corpus in place: the poems first,
# then their prose counterparts.
for column_name in ('poetry', 'text'):
    all_data.loc[:, column_name] = all_data.loc[:, column_name].apply(clean)

In [None]:
# Preview the first two rows after cleaning.
all_data.head(2)

# PreProcessing + Creating Inputs

In [None]:
# Shared hazm normalizer (configured with persian_numbers=False).
normalizer = hazm.Normalizer(persian_numbers=False)


def process_sents(text):
    """Turn one raw sentence into a space-separated token string.

    The text is normalised with hazm, ``.`` and ``/`` are detached from the
    neighbouring words, every ``/`` becomes the ``<sep>`` marker, and the
    result is wrapped in ``<start>`` / ``<end>``.

    Args:
        text: a single prose sentence or poem.

    Returns:
        The processed string. Tokens are always separated by exactly one
        space, so splitting on ``' '`` never yields an empty token, even
        when the text starts or ends with ``/``, ``.`` or whitespace.
    """
    text = normalizer.normalize(text)

    # put a space in front of every dot and slash
    text = re.sub(r'([/.])', r' \1', text)

    # a slash separates two hemistichs: replace it by the <sep> marker
    text = re.sub(r' */ *', ' <sep> ', text)

    # split()/join collapses every whitespace run AND drops leading or
    # trailing whitespace, which would otherwise leave a double space next
    # to the <start>/<end> markers
    return ' '.join(['<start>', *text.split(), '<end>'])


In [None]:
def tokenize(lang):
    """Fit a Keras tokenizer on ``lang`` and encode it as padded ids.

    Args:
        lang: list of already pre-processed sentences.

    Returns:
        ``(tensor, tokenizer)``: the id sequences padded at the end
        (``padding='post'``) and the fitted tokenizer.
    """
    # filters='' : no character is stripped before the vocabulary is built
    fitted_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    fitted_tokenizer.fit_on_texts(lang)

    # words -> ids, then pad every sequence at its end
    id_sequences = fitted_tokenizer.texts_to_sequences(lang)
    padded_ids = tf.keras.preprocessing.sequence.pad_sequences(
        id_sequences, padding='post')

    return padded_ids, fitted_tokenizer

In [None]:
def create_load_dataset(df):
    """Build padded id tensors and tokenizers for both sides of ``df``.

    Args:
        df: DataFrame with a ``text`` (input) and a ``poetry`` (target)
            column.

    Returns:
        ``(input_tensor, target_tensor, input_lang_tokenizer,
        target_lang_tokenizer)``.
    """
    def _encode_column(column):
        # pre-process every sentence of the column, then tokenize them
        raw_sentences = df.loc[:, column].values.tolist()
        prepared = [process_sents(raw) for raw in raw_sentences]
        return tokenize(prepared)

    # each side gets its own tokenizer / vocabulary
    source_ids, source_tokenizer = _encode_column('text')
    target_ids, target_tokenizer = _encode_column('poetry')

    return source_ids, target_ids, source_tokenizer, target_tokenizer

In [None]:
# Encode the whole corpus once; the tokenizers are reused at inference time.
(input_tensor, target_tensor,
 input_lang_tokenizer, target_lang_tokenizer) = create_load_dataset(all_data)


In [None]:
# The second dimension of each padded tensor is the padded sequence
# length of that side of the corpus.
max_len_input = input_tensor.shape[1]
max_len_target = target_tensor.shape[1]

print('longest sequence and the length of texts: ',
      colored(max_len_input, 'blue'))
print('longest sequence and the length of poetries: ',
      colored(max_len_target, 'blue'))

# Vocabularies

In [None]:
# Length of the constructed vocabularies:
# one extra id is reserved for padding.
vocab_len_i = len(input_lang_tokenizer.index_word) + 1
print("Plain text vocab has", colored(f"{vocab_len_i:,}", 'green'), "unique words.")

vocab_len_t = len(target_lang_tokenizer.index_word) + 1
print("Poetry vocab has", colored(f"{vocab_len_t:,}", 'green'), "unique words.")



In [None]:
def convert(text, poetry):
    """Print the id -> word mapping of one encoded text / poem pair.

    Args:
        text: sequence of input-vocabulary ids.
        poetry: sequence of target-vocabulary ids.

    Ids equal to 0 are skipped.
    """
    sections = (
        ('Text:', text, input_lang_tokenizer),
        ('\nPoetry:', poetry, target_lang_tokenizer),
    )
    for heading, token_ids, tokenizer in sections:
        print(colored(heading, 'green'))
        for token_id in token_ids:
            if token_id == 0:
                continue
            print("%d -----> %s" % (token_id, tokenizer.index_word[token_id]))

In [None]:
# Show one example as raw strings and then as id -> word pairs.
# NOTE(review): label 5 of all_data is assumed to be row 5 of the
# tensors - confirm all_data uses a default integer index.
print(colored('Text: ', 'blue'), all_data.loc[5, 'text'])
print(colored('Poetry: ', 'blue'), all_data.loc[5, 'poetry'])
convert(input_tensor[5], target_tensor[5])

# Creating the Model

In [None]:
# Split the padded tensors into train / validation parts with the
# precomputed index collections.
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val =\
input_tensor[train_indices], input_tensor[val_indices],  target_tensor[train_indices], target_tensor[val_indices]

print('Length of train and val:', 
      colored(f"{len(input_tensor_train), len(input_tensor_val)}", 'blue'))

In [None]:
# Main parameters of the model and of its input pipeline.

len_data = len(input_tensor_train)
batch_s = 64
# number of full batches in one pass over the training data
steps_per_epoch = len_data // batch_s
embedding_dim = 256
units = 1024

In [None]:
# Sizes of the two splits.
len_data_train = len(input_tensor_train)
len_data_test = len(target_tensor_val)

# Build the training pipeline step by step: (input, target) pairs as
# int32 arrays -> shuffle over the whole training set -> fixed-size
# batches (the incomplete last batch is dropped).
train_pairs = (
    np.array(input_tensor_train.tolist(), dtype='int32'),
    np.array(target_tensor_train.tolist(), dtype='int32'),
)
train_batches = tf.data.Dataset.from_tensor_slices(train_pairs)
train_batches = train_batches.shuffle(len_data_train)
train_batches = train_batches.batch(batch_s, drop_remainder=True)



In [None]:
# Pull a single batch out of the pipeline; it is reused below to
# smoke-test the encoder and the decoder.
input_batch_sample, target_batch_sample = next(iter(train_batches))

print('A sample of text(input) batch: \n', 
      colored(input_batch_sample, 'blue'))

print('\nA sample of poetry(target) batch: \n', 
      colored(target_batch_sample, 'blue'))



In [None]:
class Encoder(tf.keras.Model):
    """Embedding + GRU encoder for the plain-text side of the model.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: width of every embedding vector.
        enc_units: number of GRU units (size of the hidden state).
        batch_s: batch size used to build the all-zero initial state.
    """

    def __init__(self, vocab_size, embedding_dim,
                 enc_units, batch_s, ):
        super().__init__()

        self.batch_s = batch_s
        self.enc_units = enc_units

        # token ids -> dense vectors
        self.embeddings = tf.keras.layers.Embedding(vocab_size,
                                                    embedding_dim)
        # returns the output of every time step plus the final state
        self.gru = tf.keras.layers.GRU(
            self.enc_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )

    def call(self, x, hidden):
        """Embed ``x`` and run the GRU starting from ``hidden``.

        Returns:
            ``(output, state)`` as produced by the GRU layer.
        """
        embedded = self.embeddings(x)
        sequence_output, final_state = self.gru(embedded,
                                                initial_state=hidden)
        return sequence_output, final_state

    def initialize_hidden_state(self):
        """Return an all-zero state of shape ``(batch_s, enc_units)``."""
        state_shape = (self.batch_s, self.enc_units)
        return tf.zeros(state_shape)

In [None]:
# Encoder over the plain-text (input) vocabulary.
encoder = Encoder(vocab_len_i, embedding_dim, units, batch_s)

In [None]:
# Start from the all-zero hidden state ...
sample_hidden_states = encoder.initialize_hidden_state()
# ... and run the sample batch through the encoder.
sample_encoder_output, sample_hidden_states_encoder = encoder(input_batch_sample, sample_hidden_states)

# Print the shapes of the returned state and output.
print('Encoder hidden states shapes:',
      colored(sample_hidden_states_encoder.shape, 'blue'))
print('Encoder output shape:',
      colored(sample_encoder_output.shape, 'blue'))

In [None]:
# Print the actual values returned by the encoder for the sample batch.
print('Hidden states after being processed in gru:\n',
      colored(sample_hidden_states_encoder, 'blue'))

print('\nEncoder output sample:\n', colored(sample_encoder_output, 'blue'))

In [None]:
class Decoder(tf.keras.Model):
    """One-step GRU decoder with Bahdanau attention.

    Args:
        vocab_size: size of the target vocabulary (width of the output).
        embedding_dim: width of every embedding vector.
        decoder_units: number of GRU units; because the incoming hidden
            state seeds the GRU, it must have this width.
        batch_s: batch size (stored only).
    """

    def __init__(self, vocab_size, embedding_dim, 
                 decoder_units, batch_s, ):

        super(Decoder, self).__init__()

        self.batch_s = batch_s
        self.decoder_units = decoder_units
        self.embeddings = tf.keras.layers.Embedding(vocab_size, embedding_dim)

        self.gru = tf.keras.layers.GRU(self.decoder_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        # Output projection WITHOUT an activation: the decoder emits raw
        # logits, because the training loss of this file is created with
        # from_logits=True and applies the softmax itself. A softmax here
        # would be applied twice. (argmax decoding is unaffected.)
        self.fc = tf.keras.layers.Dense(vocab_size)

        self.attention = BahdanauAttention(self.decoder_units)

    def call(self, x, hidden, encoder_output):
        """Predict the next token for every sequence of the batch.

        Args:
            x: ids of the previous target token, shape (batch_s, 1).
            hidden: previous decoder state (the encoder state on the
                first step), shape (batch_s, decoder_units).
            encoder_output: encoder outputs,
                shape (batch_s, max_len, hidden_states_s).

        Returns:
            ``(logits, state)``: unnormalised vocabulary scores of shape
            (batch_s, vocab_size) and the new decoder state.
        """
        # attend over the encoder outputs, queried by the previous state
        context_vector = self.attention(query=hidden,
                                        value=encoder_output)

        # expand the ids into embedding vectors
        # x = (batch_s, 1, embedding_dim)
        x = self.embeddings(x)

        # concatenate the context vector and the token embedding
        x = tf.concat([tf.expand_dims(context_vector, 1), x],
                      axis=-1)

        # Seed the GRU with the previous state. Without initial_state the
        # layer restarts from zeros on every call, so the decoder would
        # carry no recurrent memory from one generated token to the next.
        output, state = self.gru(x, initial_state=hidden)

        # output = (batch_size, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))

        # batch_s, vocab_size
        x = self.fc(output)

        return x, state



In [None]:
class BahdanauAttention(tf.keras.layers.Layer):
    """Additive attention: score = V(tanh(W1(query) + W2(value)))."""

    def __init__(self, units):
        super().__init__()

        # projections of the query and of the values, and the scorer
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, value):
        """Return the attention-weighted sum of ``value``.

        Args:
            query: decoder state, (batch_s, hidden_states).
            value: encoder outputs, (batch_s, max_len_input, hidden_states).

        Returns:
            The context vector, (batch_s, hidden_s).
        """
        # (batch_s, 1, hidden_states): add an axis so the query
        # broadcasts over the input positions
        expanded_query = tf.expand_dims(query, 1)

        # (batch_s, max_len, 1): one score per input position
        energies = tf.nn.tanh(self.W1(expanded_query) + self.W2(value))
        scores = self.V(energies)

        # normalise the scores over the input positions
        weights = tf.nn.softmax(scores, axis=1)

        # weight every position and sum the positions away
        return tf.reduce_sum(weights * value, axis=1)

In [None]:
# Decoder over the poetry (target) vocabulary.
decoder = Decoder(vocab_len_t, embedding_dim, units, batch_s)


# Smoke test: a single decoding step on the encoder sample.
# NOTE(review): tf.random.uniform presumably yields floats here rather
# than integer token ids - confirm that this is intended.
sample_decoder_output, states = decoder(x = tf.random.uniform((batch_s, 1)), 
                                        hidden = sample_hidden_states_encoder,
                                        encoder_output = sample_encoder_output)

# Train

In [None]:
optimizer = tf.keras.optimizers.Adam()
# reduction='none' keeps one loss value per element, so that padded
# positions can be zeroed out before averaging
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')


def loss_function(real, pred):
    """Cross-entropy of one time step with padded positions zeroed out.

    Args:
        real: expected token ids; id 0 marks padding.
        pred: predictions of the decoder for the same positions.

    Returns:
        The mean of the masked losses. The mean runs over every entry,
        padded ones (contributing 0) included.
    """
    per_token_loss = loss_object(real, pred)

    # 1.0 where a real token is expected, 0.0 on padding
    is_padding = tf.math.equal(real, 0)
    keep = tf.cast(tf.math.logical_not(is_padding),
                   dtype=per_token_loss.dtype)

    return tf.reduce_mean(per_token_loss * keep)


In [None]:


# tf.function: run the step as a graph instead of eagerly.

@tf.function
def train_step(input, target, encoder_hidden):
    """Run one teacher-forced optimisation step on a single batch.

    Args:
        input: batch of input id sequences.
        target: batch of target id sequences.
        encoder_hidden: initial hidden state handed to the encoder.

    Returns:
        The summed per-step loss divided by the target sequence length.
    """
    start_token_id = target_lang_tokenizer.word_index['<start>']
    target_len = target.shape[1]
    loss = 0

    with tf.GradientTape() as tape:
        # the final encoder state is the first decoder state
        encoder_output, decoder_state = encoder(input, encoder_hidden)

        # every sequence of the batch starts with <start>
        decoder_input = tf.expand_dims([start_token_id] * batch_s, 1)

        # Teacher forcing: position 0 is <start>, so predict positions
        # 1 .. end and always feed the TRUE token as the next input.
        for step in range(1, target_len):
            step_predictions, decoder_state = decoder(
                decoder_input, decoder_state, encoder_output
            )

            expected_ids = target[:, step]
            loss = loss + loss_function(expected_ids, step_predictions)

            decoder_input = tf.expand_dims(expected_ids, axis=1)

    # both sub-models are optimised together
    trainable = encoder.trainable_variables + decoder.trainable_variables
    grads = tape.gradient(loss, trainable)
    optimizer.apply_gradients(zip(grads, trainable))

    # reported value: loss averaged over the sequence length
    return loss / int(target_len)

In [None]:
# Run a garbage-collection pass before training starts.
gc.collect()

In [None]:
# alpha = 0.6
epochs = 13 
start_id = '<start>'

for epoch in range(epochs):

    print("epoch ", colored(epoch, 'blue'))
    start = time.time()

    # every epoch starts from the all-zero encoder state
    encoder_hidden_state = encoder.initialize_hidden_state()
    total_loss = 0

    # The batch variables are deliberately not called `input`: at notebook
    # scope that name would shadow the builtin for the rest of the session.
    for batch, (input_batch, target_batch) in enumerate(
            train_batches.take(steps_per_epoch)):

        batch_loss = train_step(input_batch, target_batch,
                                encoder_hidden_state)
        total_loss += batch_loss

        gc.collect()

        # progress report every 50 batches
        if batch % 50 == 0:
            print('batch ', colored(batch, 'green'),
                  f' Loss {batch_loss.numpy():.4f}')

    #checkpoint.save(file_prefix=checkpoint_prefix)
    # report the accumulated loss as a per-batch mean
    # (max(..., 1) guards against an empty training set)
    print(f'Epoch mean loss: {float(total_loss) / max(steps_per_epoch, 1):.4f}')
    print(f'Time taken: {time.time() - start:.2f} seconds')

# Normal Evaluation

In [None]:
def evaluate(sentence):
    """Greedily decode a poem for one prose sentence.

    Args:
        sentence: raw prose text.

    Returns:
        ``(output, sentence)``: the generated tokens, each followed by a
        single space (including ``<end>`` when it was produced), and the
        pre-processed input sentence.

    Raises:
        KeyError: if a word of ``sentence`` is missing from the input
            vocabulary.
    """
    # same pre-processing as the training data
    sentence = process_sents(sentence)

    # words -> ids; split() instead of split(' ') so that repeated spaces
    # never produce an empty token (which is not in the vocabulary)
    token_ids = [input_lang_tokenizer.word_index[token]
                 for token in sentence.split()]
    inputs = tf.keras.preprocessing.sequence.pad_sequences(
        [token_ids], maxlen=max_len_input, padding='post')
    inputs = tf.convert_to_tensor(inputs)

    # encode with an all-zero initial state for a batch of one
    hidden_state = [tf.zeros((1, units))]
    encoder_output, encoder_hidden_state = encoder(inputs, hidden_state)

    decoder_hidden_state = encoder_hidden_state
    decoder_input = tf.expand_dims(
        [target_lang_tokenizer.word_index['<start>']], 0)

    index_word = target_lang_tokenizer.index_word
    generated = []

    for _ in range(max_len_target):
        predictions, decoder_hidden_state = decoder(
            decoder_input, decoder_hidden_state,
            encoder_output
        )

        # greedy decoding: keep the highest-scoring token
        predicted_id = tf.argmax(predictions[0]).numpy()
        word = index_word[predicted_id]
        generated.append(word)

        # stop as soon as the end marker is produced
        if word == '<end>':
            break

        # the prediction is the input of the next step
        decoder_input = tf.expand_dims([predicted_id], 0)

    # every token is followed by one space, exactly as before
    output = ''.join(word + ' ' for word in generated)

    return output, sentence

In [None]:
def transform(text):
    """Generate a poem for ``text`` and print the input and the result."""
    generated_poem, processed_text = evaluate(sentence=text)

    for label, content in (('Text: ', processed_text),
                           ('Generate poetry:', generated_poem)):
        print(label, content)

In [None]:
# Try the trained model on one hand-written prose sentence.
transform('با این توصیف عشاق بی عقل و بدون هدف خاص زندگی می کنند و دارای هیچ هدف و مغزی نیستند تا اینکه به جهنم می رسند و به هیچ جایگاه دنیوی و واقعی دست پیدا نمی کنند')  

# Evaluate a Dataset

In [None]:
def evaluate_dataset(df):
    """Generate a poem for every row of ``df``.

    Args:
        df: DataFrame whose two columns are, in this order, the
            ground-truth poem and the prose ``text``.

    Returns:
        A copy of ``df`` (index reset) with the columns renamed to
        ``poetry_ground_truth`` / ``text`` and a third column
        ``poetry_generated_Seq2Seq_with_Att``. Rows that could not be
        decoded hold ``None`` in that column.
    """
    df = df.reset_index(drop=True)

    generated_p = []
    for r, text in enumerate(df.loc[:, 'text']):
        try:
            # decoding lives in evaluate(); no second copy of it here
            output, _ = evaluate(text)
        except Exception as err:
            # Best effort: report the failing row and carry on. Catching
            # Exception (not a bare except) keeps Ctrl-C / kernel
            # interrupts working during this long loop.
            print(r)
            print(text)
            print(f'{type(err).__name__}: {err}')
            output = None

        generated_p.append(output)

    df_output = pd.concat([df, pd.Series(generated_p)],
                          axis=1)

    df_output.columns = ['poetry_ground_truth',
                         'text',
                         'poetry_generated_Seq2Seq_with_Att']

    return df_output

In [None]:
# Display the validation indices.
val_indices

In [None]:

# Generate a poem for every validation row and display the table.
output_df = evaluate_dataset(all_data.loc[val_indices])
output_df

In [None]:
# Save the generated poems; the file name records the hyper-parameters
# of the run (epochs, batch size, embedding size, units).
output_df.to_csv(f'.../Results/Phase|Models/Seq2Seq_with_att_{epochs}_epochs_{batch_s}_batch_s_{embedding_dim}_embedding_dim_{units}_units_.csv',
                 index=False)