In [None]:
!pip install hazm

In [None]:
import collections

import helper
import numpy as np
from sklearn.model_selection import train_test_split

import string
import unicodedata
import re
import numpy as np
import os
import io
import time
import pandas as pd
import hazm


import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional, Dropout, LSTM
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy
import tensorflow as tf
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_colwidth', None)
from termcolor import colored
from itertools import chain
#from transformers import BertTokenizer, BertModel
who_am_i = 'Mitra'



In [None]:
# Colab-specific: mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# NOTE(review): `who_am_i` is already assigned in the imports cell and is not
# read by any of the visible cells.
who_am_i = 'Mitra'


# Load the augmented dataset (columns used later: 'text', 'poetry', 'Unnamed: 0').
all_data = pd.read_csv('.../final/ProsPoemParallelDataset_augmented.csv')

print('length of augmented cleaned data: ', 
      colored(len(all_data), 'blue'))
# Pre-computed row indices for the validation / training subsets.
# pd.read_pickle unpickles arbitrary objects: only load files from a trusted source.
val_indices = pd.read_pickle('.../validation_indices.pickle')
train_indices = pd.read_pickle('.../train_indices.pickle')

def clean(t):
    """Tidy one raw prose/poetry string.

    Applied in this order:
      1. remove one leading and one trailing space;
      2. surround every ``/`` with exactly one space on each side;
      3. remove all backslashes;
      4. merge `` .`` followed (after optional spaces) by another ``.`` into ``.``;
      5. replace a run of spaces followed by a whitespace character with one space;
      6. turn a final `` .`` into ``.`` and drop a leading ``.`` (with its spaces);
      7. remove Persian digits.

    Args:
        t: the string to clean.

    Returns:
        The cleaned string.
    """
    t = re.sub(r'^ ', '', t)
    t = re.sub(r' $', '', t)
    t = re.sub(r' */ *', ' / ', t)
    t = t.replace('\\', '')
    # The replacement has to be a plain '.', not '\.': in a replacement template
    # re.sub keeps unknown non-letter escapes verbatim, so '\.' would write a
    # literal backslash into the text (after the backslash removal above).
    t = re.sub(r' \. *\.', '.', t)
    t = re.sub(r' +\s', ' ', t)

    t = re.sub(r' \.$', '.', t)
    t = re.sub(r'^ *\. *', '', t)

    t = re.sub('[۱۲۳۴۵۶۷۸۹۰]', '', t)

    return t

# Run the same cleaning routine over both text columns.
all_data.loc[:, 'poetry'] = all_data.loc[:, 'poetry'].apply(clean)
all_data.loc[:, 'text'] = all_data.loc[:, 'text'].apply(clean)

# Renumber the rows from 0 and discard the 'Unnamed: 0' column
# (presumably an index column written out with the CSV -- confirm).
all_data.reset_index(drop=True, inplace=True)
all_data.drop(columns=['Unnamed: 0'], inplace=True)

In [None]:
# Label-based selection of the pre-computed validation and training rows.
# NOTE(review): neither frame is referenced again in the visible cells; the
# training cell splits the full tensors with KFold instead.
validation_set = all_data.loc[val_indices]
train_set = all_data.loc[train_indices]

In [None]:
normalizer = hazm.Normalizer(persian_numbers=False)

def process_sents(text):
    """Normalise one sentence and wrap it in ``<start>`` / ``<end>`` markers.

    The text is passed through the hazm normalizer, ``.`` and ``/`` are split
    off as separate tokens, every ``/`` becomes the ``<sep>`` token, and all
    tokens are joined by exactly one space.

    Args:
        text: raw sentence (prose or poetry).

    Returns:
        ``'<start> tok1 tok2 ... <end>'``. Tokens are always separated by a
        single space, so ``result.split(' ')`` never yields an empty string,
        even when ``text`` is empty or starts/ends with whitespace or ``/``.
    """
    text = normalizer.normalize(text)

    # put a space in front of every '/' and '.' so they become separate tokens
    text = re.sub(r'([\/\.])', r' \1', text)

    # '/' separates the mesras: replace it with the <sep> token
    text = re.sub(r' *\/ *', ' <sep> ', text)

    # split on any whitespace run and drop the empty pieces that leading or
    # trailing whitespace would otherwise leave behind
    tokens = [tok for tok in re.split(r'\s+', text) if tok]

    # add start and end tokens
    return ' '.join(['<start>'] + tokens + ['<end>'])


In [None]:
def tokenize(lang, target=True, max_len=35):
    """Fit a Keras tokenizer on ``lang`` and encode it as a padded id matrix.

    Args:
        lang: list of preprocessed sentences to fit on and to encode.
        target: not read by the body.
        max_len: number of columns of the result; shorter sequences get zeros
            appended, longer ones are cut by ``pad_sequences`` (its default
            ``truncating`` mode is used).

    Returns:
        ``(tensor, lang_tokenizer)``: the padded id matrix and the fitted tokenizer.
    """
    keras_prep = tf.keras.preprocessing

    # an empty `filters` string means no characters are stripped before splitting
    lang_tokenizer = keras_prep.text.Tokenizer(filters='')
    lang_tokenizer.fit_on_texts(lang)

    # words -> integer ids, then pad every row to `max_len`
    id_sequences = lang_tokenizer.texts_to_sequences(lang)
    tensor = keras_prep.sequence.pad_sequences(
        id_sequences,
        maxlen=max_len,
        padding='post',
    )

    return tensor, lang_tokenizer

In [None]:
def create_load_dataset(df):
    """Turn the 'text' / 'poetry' columns of ``df`` into padded id matrices.

    Args:
        df: frame with a 'text' (input) and a 'poetry' (target) column.

    Returns:
        ``(input_tensor, target_tensor, input_lang_tokenizer,
        target_lang_tokenizer)``; the target matrix is padded to the same
        width as the input matrix.
    """
    raw_inputs, raw_targets = (
        df[column].values.tolist() for column in ('text', 'poetry')
    )

    # preprocess every sentence on both sides
    prepared_inputs = [process_sents(sentence) for sentence in raw_inputs]
    prepared_targets = [process_sents(sentence) for sentence in raw_targets]

    # one tokenizer (and id matrix) per side
    input_tensor, input_lang_tokenizer = tokenize(prepared_inputs)
    target_tensor, target_lang_tokenizer = tokenize(
        prepared_targets,
        target=True,
        max_len=input_tensor.shape[1],
    )

    return input_tensor, target_tensor, input_lang_tokenizer, target_lang_tokenizer

In [None]:
# Build the padded id matrices and the fitted tokenizers from the whole dataset.
(
    input_tensor,
    target_tensor,
    input_lang_tokenizer,
    target_lang_tokenizer,
) = create_load_dataset(all_data)


In [None]:
# Width (second dimension) of each padded id matrix.
# NOTE: tokenize() pads to its `max_len` (default 35) and the target matrix is
# padded to the input width, so both numbers are the fixed pad width rather
# than a length measured from the longest sentence.
max_len_input = input_tensor.shape[1]
max_len_target = target_tensor.shape[1]

print('longest sequence and the length of texts: ',
      colored(max_len_input, 'blue'))
print('longest sequence and the length of poetries: ',
      colored(max_len_target, 'blue'))

In [None]:
# Size of each fitted vocabulary.
# The extra 1 accounts for id 0, which is used for padding.
vocab_len_i = 1 + len(input_lang_tokenizer.index_word)
print("Plain text vocab has", colored(f"{vocab_len_i:,}", 'green'), "unique words.")

vocab_len_t = 1 + len(target_lang_tokenizer.index_word)
print("Poetry vocab has", colored(f"{vocab_len_t:,}", 'green'), "unique words.")


In [None]:
def convert(text, poetry):
    """Print the ``id -----> word`` mapping of one encoded text/poetry pair.

    Args:
        text: sequence of input-side token ids.
        poetry: sequence of target-side token ids.

    Ids equal to 0 (padding) are not printed.
    """
    sections = (
        ('Text:', text, input_lang_tokenizer),
        ('\nPoetry:', poetry, target_lang_tokenizer),
    )
    for heading, token_ids, tokenizer in sections:
        print(colored(heading, 'green'))
        for token_id in token_ids:
            if token_id == 0:
                continue
            print("%d -----> %s" % (token_id, tokenizer.index_word[token_id]))

In [None]:
# Show row 5 as cleaned strings, followed by the id -> word mapping of its
# encoded input and target rows.
print(colored('Text: ', 'blue'), all_data.loc[5, 'text'])
print(colored('Poetry: ', 'blue'), all_data.loc[5, 'poetry'])
convert(input_tensor[5], target_tensor[5])

# Sequence-to-sequence model with an embedding layer as its first layer

In [None]:

def seq_2_seq(input_vocab_s, output_vocab_s, embedding_dim,
              gru_d=256, drop_out = 0.5, l_rate=0.002):
    """Build and compile an Embedding -> GRU -> Dropout -> Dense(softmax) model.

    The network emits one probability distribution over the output
    vocabulary for every input time step.

    Args:
        input_vocab_s: size of the input vocabulary (``input_dim`` of the embedding).
        output_vocab_s: size of the output vocabulary (units of the final layer).
        embedding_dim: dimensionality of the embedding vectors.
        gru_d: number of GRU units.
        drop_out: dropout rate applied to the GRU outputs.
        l_rate: learning rate of the SGD optimizer.

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    model = keras.Sequential()

    model.add(layers.Embedding(input_dim=input_vocab_s, output_dim=embedding_dim))

    # return_sequences=True: the GRU output is 3D, (batch_size, timesteps, gru_d)
    model.add(layers.GRU(gru_d, return_sequences=True))

    model.add(layers.Dropout(rate=drop_out))

    # Dense acts on the last axis, giving one softmax distribution per time
    # step: (batch_size, timesteps, output_vocab_s)
    model.add(layers.Dense(output_vocab_s, activation='softmax'))

    model.compile(
        # The last layer already applies softmax, so the loss receives
        # probabilities; from_logits=True would apply softmax a second time.
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False),
        optimizer=tf.keras.optimizers.SGD(learning_rate=l_rate),
        metrics=["accuracy"],
    )
    return model


In [None]:
# Instantiate the network with 1024-d embeddings and a 1024-unit GRU.
model = seq_2_seq(
    vocab_len_i,
    vocab_len_t,
    embedding_dim=1024,
    gru_d=1024,
    drop_out=0.5,
    l_rate=0.001,
)

In [None]:
# Print the layer summary of the model built above.
model.summary()

In [None]:
# NOTE(review): scratch calculation -- presumably exp(cross-entropy loss), i.e.
# the perplexity for a loss of 1.34; confirm or remove.
np.exp(1.34)

In [None]:
# Peek at the first few padded id rows instead of echoing the whole matrix.
input_tensor[:5]

In [None]:
from sklearn.model_selection import KFold


epochs = 5
n_splits = 3
batch_s = 64

# The fold indices depend only on the number of rows, so the splitter is built
# once and is given the id matrix directly (no per-epoch list/Series copy).
k_fold = KFold(n_splits)

# NOTE(review): the same `model` keeps training across all folds and epochs, so
# every fold's validation rows are also used for training in the other folds;
# the reported validation metrics are therefore not a clean hold-out estimate.
for epoch in range(epochs):

    print(colored(f"Epoch {epoch+1}", 'green', attrs=['bold', 'underline']))

    start = time.time()
    for i, (train_index, test_index) in enumerate(k_fold.split(input_tensor)):

        print(colored(f'fold {i+1}', 'green'))

        # train / validation rows of this fold
        input_tensor_train, input_tensor_val = input_tensor[train_index], input_tensor[test_index]
        target_tensor_train, target_tensor_val = target_tensor[train_index], target_tensor[test_index]

        # one pass over the fold's training rows (fit defaults to a single epoch)
        model.fit(x=input_tensor_train,
                  y=target_tensor_train,
                  validation_data=(input_tensor_val, target_tensor_val),
                  batch_size=batch_s)

    print(f'epoch time: {time.time() - start:.1f} s')





In [None]:

def evaluate(sent, max_len=40):
    """Generate poetry for one plain-text sentence with the trained model.

    The sentence is preprocessed with ``process_sents``, encoded with the
    input tokenizer, padded to ``max_len_input`` and fed to ``model``. The
    most probable target word is taken at every time step; decoding stops at
    the first padding id (0) or ``<end>`` token, and a word that repeats the
    prediction of the previous time step is skipped.

    Args:
        sent: raw input sentence.
        max_len: not used (the module-level ``max_len_input`` sets the padded
            width); kept so existing calls keep working.

    Returns:
        The generated words as one string, each word preceded by a space.
    """
    # preprocessing every sentence before giving
    # them to the model
    sentence = process_sents(sent)
    print(sentence)

    # Encode with the tokenizer itself, exactly as the training data was
    # encoded: words missing from the vocabulary are skipped instead of
    # raising KeyError.
    inputs = input_lang_tokenizer.texts_to_sequences([sentence])

    inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs,
                                                          maxlen=max_len_input,
                                                          padding='post')

    inputs = tf.convert_to_tensor(inputs, dtype=tf.int32)

    pred = model.predict(inputs)

    # The model built by seq_2_seq outputs (batch, timesteps, vocab): the
    # predicted word id is the arg-max over the last (vocabulary) axis.
    pred_ = tf.argmax(pred, axis=-1, output_type=tf.int32).numpy()[0]

    end_id = target_lang_tokenizer.word_index['<end>']
    words = []
    previous_id = None  # no predecessor for the first time step
    for token_id in pred_:
        token_id = int(token_id)
        if token_id == end_id or token_id == 0:
            break
        if token_id != previous_id:
            words.append(target_lang_tokenizer.index_word[token_id])
        previous_id = token_id

    return ''.join(' ' + word for word in words)

def print_poetry(sent):
    """Generate poetry for ``sent`` and print the input and the result, each
    behind a green label."""
    generated = evaluate(sent)
    for label, value in (('Text: ', sent), ('Poetry: ', generated)):
        print(colored(label, 'green'), value)

In [None]:
def evaluate_dataset(df):
    """Generate poetry for every row of ``df`` and return it next to the inputs.

    Each 'text' value is preprocessed, encoded, padded to ``max_len_input``
    and passed through ``model``. The most probable target word is taken at
    every time step and decoding stops at the first padding id (0) or
    ``<end>`` token. Unlike ``evaluate``, repeated words are kept.

    Args:
        df: frame with a 'poetry' and a 'text' column.

    Returns:
        A new frame with the columns 'poetry_ground_truth', 'text' and
        'poetry_generated_Seq2Seq_GRU' (each generated word is preceded by a
        space), indexed from 0.
    """
    df = df.reset_index(drop=True)

    # preprocessing every sentence before giving them to the model; encoding
    # through the tokenizer skips out-of-vocabulary words instead of raising
    # KeyError
    sentences = [process_sents(text) for text in df['text']]
    sequences = input_lang_tokenizer.texts_to_sequences(sentences)

    end_id = target_lang_tokenizer.word_index['<end>']
    generated_p = []

    # rows are predicted one at a time so that only one
    # (1, timesteps, vocab) probability tensor is alive at once
    for sequence in sequences:
        inputs = tf.keras.preprocessing.sequence.pad_sequences([sequence],
                                                            maxlen=max_len_input,
                                                            padding='post')
        inputs = tf.convert_to_tensor(inputs, dtype=tf.int32)

        pred = model.predict(inputs)

        # predicted word id per time step = arg-max over the vocabulary axis
        pred_ = tf.argmax(pred, axis=-1, output_type=tf.int32).numpy()[0]

        words = []
        for token_id in pred_:
            token_id = int(token_id)
            if token_id == 0 or token_id == end_id:
                break
            words.append(target_lang_tokenizer.index_word[token_id])

        generated_p.append(''.join(' ' + word for word in words))

    # select and rename by column name, so the result does not depend on the
    # column order or on extra columns of `df`
    df_output = df[['poetry', 'text']].rename(
        columns={'poetry': 'poetry_ground_truth'})
    df_output['poetry_generated_Seq2Seq_GRU'] = generated_p

    return df_output

In [None]:
# NOTE: label-based .loc slicing includes the end label, so with the 0-based
# index set by reset_index this passes rows 0..100 (101 rows).
df_output = evaluate_dataset(all_data.loc[:100])

In [None]:
# Display the generated poetry next to the ground truth.
df_output

In [None]:
# NOTE(review): `batch_size`, `drop_out_r`, `embedding_dim`, `depth` and `l_rate`
# are not assigned in any visible cell (the training cell defines `epochs` and
# `batch_s`; the model hyper-parameters are passed as literals), so on a fresh
# kernel this raises NameError unless they are defined elsewhere -- confirm.
# The file name also places `batch_size` right after the "epochs_" label;
# presumably a `{epochs}` field is missing -- confirm.
df_output.to_csv(f'.../Results/Phase|Models/Seq2Seq_with_GRU_epochs_{batch_size}_batch_size_{drop_out_r}_drop_out_r_{embedding_dim}_embedding_dim_{depth}_depth_{l_rate}_l_rate.csv',
                 index=False)

In [None]:
# Generate and print poetry for one hand-written prose sentence.
sys_inp = 'عاشقان در این زندگی به دنبال جایگاه دنیوی نیستند. آن ها این دنیا را پست و بی مقدار و خار می دانند.'

print_poetry(sys_inp)