In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers
from tensorflow.keras.layers import ELU, Embedding, Dense, Dropout, Input, Layer, LayerNormalization
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.initializers import GlorotUniform
from tensorflow.keras.optimizers import SGD, RMSprop, Adam
from tensorflow.keras import regularizers, losses
from tensorflow.keras import backend as K

%tensorflow_version 2.x

import os
import datetime
import random
import sys

# Turn off eager execution so TensorFlow runs in graph mode for the rest of the session.
# NOTE(review): the reason graph mode is required is not visible in this notebook — confirm it is still needed.
tf.compat.v1.disable_eager_execution()

Using TensorFlow backend.


In [2]:
from google.colab import drive

# Google Drive is exposed under this directory; every data/checkpoint path below lives beneath it.
GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [3]:
# import dataset from txt file

DataSetPath = '/content/gdrive/My Drive/bsc/Final/DataSet/ferdousi_norm.txt'

# Each row of the text file is read as one record; only the first column is kept.
dataset = pd.read_csv(DataSetPath, na_values=" ", header=None)
temp = dataset[0].to_numpy()

# Appending onto an empty array yields a flat 1-D array of the rows.
X = np.append(np.array(()), temp)

In [4]:
# Peek at the first three entries, then the last and the second-to-last one.
for preview in (X[:3], X[-1], X[-2]):
    print(preview)

['به نام خداوند جان و خرد' 'کزین برتر اندیشه برنگذرد'
 'خداوند نام و خداوند جای']
هر آنکس که دارد هش و رای و دین
که تخم سخن من پراگنده ام


In [5]:
# Size of the corpus as loaded (before any sub-sampling).
masraa_count = len(X)
print("The total count of masraas is ", masraa_count)

The total count of masraas is  99217


In [6]:
# Train on just a quarter of data
# Keep the first quarter, rounded down to an even count so that every
# masraa has a partner when they are paired into verses.
quarter_size = len(X) // 4
X = X[: quarter_size - quarter_size % 2]

In [7]:
# Size of the sub-sampled corpus that is actually used for training.
masraa_count = len(X)
print("The total count of masraas is ", masraa_count)

The total count of masraas is  24804


In [8]:
# Pair consecutive masraas (even index, odd index) into one verse and build
# three views of it:
#   verses        : <sos> ... <eos>   (used to fit the tokenizer)
#   input_verses  : <sos> ...         (model input)
#   output_verses :       ... <eos>   (model target, i.e. the input shifted by one token)
verses = []
input_verses = []
output_verses = []

for i in range(0, len(X), 2):
    couplet = X[i] + ' <middle> ' + X[i + 1]
    input_verses.append('<sos> ' + couplet)
    output_verses.append(couplet + ' <eos>')
    verses.append('<sos> ' + couplet + ' <eos>')

In [9]:
# One verse is built from two masraas, so this is half the masraa count.
verse_count = len(verses)
print("The total number of verses is ", verse_count)

The total number of verses is  12402


In [10]:
# tokenize all data

# filters="" keeps every character, so the <sos>/<middle>/<eos> markers survive as tokens.
tokenizer = Tokenizer(filters="")
tokenizer.fit_on_texts(verses)

# Encode the three text views with the same fitted vocabulary.
TokenizedVerses, tokenized_input, tokenized_output = (
    tokenizer.texts_to_sequences(texts)
    for texts in (verses, input_verses, output_verses)
)

In [11]:
# Both lookup directions of the fitted vocabulary.
word2index, index2word = tokenizer.word_index, tokenizer.index_word

In [12]:
# find max length for padding

max_len = 0

for token_ids in TokenizedVerses:
    if len(token_ids) > max_len:
        # A new longest verse: show it decoded back to words.
        print([index2word[token_id] for token_id in token_ids])
        max_len = len(token_ids)

# The model input drops <eos> and the target drops <sos>, so both are one
# token shorter than the full verses measured above.
max_len -= 1
print("maximum length of sentences : ", max_len)


['<sos>', 'به', 'نام', 'خداوند', 'جان', 'و', 'خرد', '<middle>', 'کزین', 'برتر', 'اندیشه', 'برنگذرد', '<eos>']
['<sos>', 'خداوند', 'کیوان', 'و', 'گردان', 'سپهر', '<middle>', 'فروزنده', 'ماه', 'و', 'ناهید', 'و', 'مهر', '<eos>']
['<sos>', 'نیابد', 'بدو', 'نیز', 'اندیشه', 'راه', '<middle>', 'که', 'او', 'برتر', 'از', 'نام', 'و', 'از', 'جایگاه', '<eos>']
['<sos>', 'خرد', 'را', 'و', 'جان', 'را', 'همی', 'سنجد', 'اوی', '<middle>', 'در', 'اندیشه', 'سخته', 'کی', 'گنجد', 'اوی', '<eos>']
['<sos>', 'سه', 'پاس', 'تو', 'چشم', 'است', 'وگوش', 'و', 'زبان', '<middle>', 'کزین', 'سه', 'رسد', 'نیک', 'و', 'بد', 'بی', 'گمان', '<eos>']
['<sos>', 'نه', 'کند', 'آوری', 'گیرد', 'از', 'باج', 'و', 'گنج', '<middle>', 'نه', 'دل', 'تیره', 'دارد', 'ز', 'رزم', 'و', 'ز', 'رنج', '<eos>']
['<sos>', 'ز', 'گنج', 'و', 'ز', 'تخت', 'و', 'ز', 'در', 'و', 'گهر', '<middle>', 'ز', 'اسپ', 'و', 'سلیح', 'و', 'کلاه', 'و', 'کمر', '<eos>']
['<sos>', 'به', 'دیبا', 'و', 'دینار', 'و', 'در', 'و', 'درم', '<middle>', 'به', 'بوی', 'و', 'به', 'رنگ'

In [13]:
# do padding

# Right-pad (with zeros) inputs and targets to the common length max_len.
input_seq, output_seq = (
    pad_sequences(sequences, maxlen=max_len, padding='post')
    for sequences in (tokenized_input, tokenized_output)
)

In [14]:
# Implement multi head self attention as a Keras layer

class MultiHeadSelfAttention(Layer):
    """Multi-head scaled dot-product self-attention with an optional causal mask.

    Every head owns its own query/key/value ``Dense`` projection of width
    ``embed_dim`` (heads are separate projections, not slices of a shared
    one). The per-head results are concatenated on the feature axis and
    projected back to ``embed_dim`` by ``combine_heads``.

    Args:
        embed_dim: Width of every per-head projection and of the layer output.
        num_heads: Number of independent attention heads.
        causal: When True (the default) position ``t`` can only attend to
            positions ``<= t``. This is required for next-token training on
            shifted sequences: without it a position can read the very token
            it is asked to predict. Pass False to get unmasked
            (bidirectional) attention. Weights trained without the mask still
            load, but will produce different outputs with the mask enabled.
        **kwargs: Forwarded to ``Layer`` (e.g. ``name``).
    """

    def __init__(self, embed_dim, num_heads=8, causal=True, **kwargs):
        super(MultiHeadSelfAttention, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.causal = causal

        # Creation order (all queries, all keys, all values, combine) is kept
        # stable so automatically generated layer names do not change.
        self.query_dense = [layers.Dense(embed_dim) for _ in range(num_heads)]
        self.key_dense = [layers.Dense(embed_dim) for _ in range(num_heads)]
        self.value_dense = [layers.Dense(embed_dim) for _ in range(num_heads)]

        self.combine_heads = layers.Dense(embed_dim)

    def self_attention(self, query, key, value):
        """Return softmax(Q.K^T / sqrt(d_k)) . V for one head.

        All three arguments are the projections of the same input, so the
        score matrix is square in its last two dimensions.
        """
        score = tf.matmul(query, key, transpose_b=True)
        dim_key = tf.cast(tf.shape(key)[-1], tf.float32)
        scaled_score = score / tf.math.sqrt(dim_key)

        if self.causal:
            # Lower-triangular matrix of ones: row t keeps columns 0..t.
            visible = tf.linalg.band_part(tf.ones_like(scaled_score), -1, 0)
            # Push the scores of future positions towards -inf so that the
            # softmax assigns them (numerically) zero weight.
            scaled_score = scaled_score - (1.0 - visible) * 1e9

        weights = tf.nn.softmax(scaled_score, axis=-1)
        return tf.matmul(weights, value)

    def call(self, inputs):
        """Apply every head to ``inputs`` and merge the results.

        ``inputs`` has shape (batch_size, seq_len, features); the result has
        shape (batch_size, seq_len, embed_dim).
        """
        # Projections are evaluated group by group (queries, keys, values),
        # matching the order in which the sub-layers were created.
        queries = [dense(inputs) for dense in self.query_dense]
        keys = [dense(inputs) for dense in self.key_dense]
        values = [dense(inputs) for dense in self.value_dense]

        head_outputs = [
            self.self_attention(q, k, v)
            for q, k, v in zip(queries, keys, values)
        ]

        # (batch_size, seq_len, embed_dim * num_heads)
        merged = tf.concat(head_outputs, 2)

        # (batch_size, seq_len, embed_dim)
        return self.combine_heads(merged)

In [23]:
# Implement a Transformer block as a layer

class TransformerBlock(Layer):
    """Self-attention followed by a two-layer feed-forward net, as one layer.

    The data flow is attention -> dropout -> feed-forward -> dropout.
    There are no residual connections and no layer normalisation in this
    block: each stage feeds straight into the next.

    Args:
        embed_dim: Output width of the attention layer and of the block.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward net (ReLU activation).
        rate: Dropout rate used after the attention and after the
            feed-forward net.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = MultiHeadSelfAttention(embed_dim, num_heads)
        feed_forward_layers = [
            Dense(ff_dim, activation="relu"),
            Dense(embed_dim),
        ]
        self.ffn = Sequential(feed_forward_layers)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs):
        """Run ``inputs`` through attention and the feed-forward net."""
        attended = self.dropout1(self.att(inputs))
        return self.dropout2(self.ffn(attended))

In [16]:
# Implement embedding layer

class TokenAndPositionEmbedding(Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: Number of distinct positions the position table can hold.
        vocab_size: Number of distinct token ids the token table can hold.
        embed_dim: Width of both embedding tables (and of the output).
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed the token ids in ``x`` and add the embedding of their positions."""
        # Positions 0 .. (size of the last axis of x) - 1.
        sequence_length = tf.shape(x)[-1]
        position_ids = tf.range(start=0, limit=sequence_length, delta=1)
        position_vectors = self.pos_emb(position_ids)
        token_vectors = self.token_emb(x)
        # The position vectors broadcast over the leading (batch) axis.
        return token_vectors + position_vectors


In [25]:
# building model

def build_model(word_num, maxlen, embed_dim = 64, word_embedding_size = 512, num_heads = 8, feed_forward_dim = 128,
                num_blocks = 2, dropout_rate = 0.1):
    """Build and compile the token-level Transformer language model.

    Architecture: token+position embedding, then ``num_blocks`` repetitions of
    (TransformerBlock, Dropout), then a per-position softmax over the
    vocabulary. The model is compiled with the Adam optimizer and sparse
    categorical cross-entropy, and its summary is printed.

    Args:
        word_num: Vocabulary size (number of output classes per position).
        maxlen: Fixed length of the input sequences.
        embed_dim: Output width of every TransformerBlock.
        word_embedding_size: Width of the token/position embeddings.
        num_heads: Attention heads per TransformerBlock.
        feed_forward_dim: Hidden width of the feed-forward net in each block.
        num_blocks: How many (TransformerBlock, Dropout) pairs to stack.
            Defaults to 2, the depth that was previously hard-coded.
        dropout_rate: Rate of the Dropout layer placed after each block.
            Defaults to 0.1, the value that was previously hard-coded.

    Returns:
        The compiled ``keras.Model``.

    Raises:
        ValueError: If ``num_blocks`` is negative.
    """
    if num_blocks < 0:
        raise ValueError("num_blocks must be >= 0, got {}".format(num_blocks))

    inputs = Input(shape=(maxlen,), dtype=tf.int32)
    embedding_layer = TokenAndPositionEmbedding(maxlen, word_num, word_embedding_size)
    x = embedding_layer(inputs)

    # Each block is created and applied before the next one is created, which
    # is the same order as the former unrolled code.
    for _ in range(num_blocks):
        x = TransformerBlock(embed_dim, num_heads, feed_forward_dim)(x)
        x = Dropout(dropout_rate)(x)

    outputs = Dense(word_num, activation="softmax")(x)
    model = keras.Model(inputs=inputs, outputs=outputs)
    model.compile(
        "adam", loss="sparse_categorical_crossentropy"
    )
    model.summary()
    return model


In [18]:
base_path = "/content/gdrive/My Drive/bsc/Final/Transformers/"
!ls "/content/gdrive/My Drive/bsc/Final/Transformers/"

acc.txt			     cp.ckpt.index
checkpoint		     loss.txt
cp.ckpt.data-00000-of-00002  sample_sentences_during_training.txt
cp.ckpt.data-00001-of-00002


In [19]:
# sample from model

def sample_line(model, type = 0):
    """Generate one verse from ``model``, token by token.

    Generation starts from ``<sos>`` and fills the input row left to right.
    The model is trained so that the output at position ``t`` is the token
    that follows input position ``t``; therefore the token for slot ``i`` is
    taken from output position ``i - 1``.

    Args:
        model: Trained model mapping a (1, max_len) id array to per-position
            probabilities over the vocabulary.
        type: 0 samples from the predicted distribution, 1 takes the most
            probable token (greedy decoding).

    Returns:
        The generated words joined by single spaces, with ``<middle>``
        rendered as "    -   ". ``<sos>`` and padding are skipped and the
        text stops at ``<eos>``.

    Raises:
        ValueError: If ``type`` is neither 0 nor 1.
    """
    if type not in (0, 1):
        raise ValueError("type must be 0 (sample) or 1 (argmax), got {!r}".format(type))

    sos = word2index['<sos>']
    eos = word2index['<eos>']
    middle = word2index['<middle>']

    np_input = np.zeros((1, max_len), dtype=int)
    np_input[0, 0] = sos

    for i in range(1, max_len):
        o = model.predict(np_input)

        # Output i-1 is the prediction for the token at input slot i.
        # float64 keeps the renormalised vector within the sum-to-1 tolerance
        # that np.random.choice enforces (float32 can miss it).
        probs = np.array(o[0][i - 1], dtype=np.float64)
        probs[0] = 0  # never emit the padding id
        if type == 0:
            probs /= probs.sum()
            idx = np.random.choice(len(probs), p=probs)
        else:
            idx = np.argmax(probs)

        np_input[0, i] = idx

        # Decoding stops at <eos>, so anything generated after it is wasted work.
        if idx == eos:
            break

    output_sentence = []
    for token_id in np_input[0]:
        if token_id == sos:
            continue
        if token_id == eos:
            break
        if token_id == 0:
            continue
        if token_id == middle:
            output_sentence.append("    -   ")
        else:
            output_sentence.append(index2word[token_id])

    decoded_review = ' '.join(output_sentence)

    return decoded_review

In [20]:
# create custom callback
class MyCustomCallback(tf.keras.callbacks.Callback):
    """After every epoch, sample one verse from the model and log it.

    The sample is printed and also appended to
    ``sample_sentences_during_training.txt`` under ``base_path``.

    Args:
        initial_epoch: Offset added to the epoch number reported by Keras,
            so that logs of a resumed run continue the earlier numbering.
    """

    def __init__(self, initial_epoch = 0):
        # The base initialiser sets up the attributes Keras expects on a callback.
        super(MyCustomCallback, self).__init__()
        self.initial_epoch = initial_epoch

    def on_epoch_end(self, epoch, logs=None):
        """Print and persist a sampled verse for the epoch that just ended."""
        # Build the header once so the console and the file show the same timestamp.
        header = 'Training: epoch {} ends at {}'.format(
            epoch + self.initial_epoch, datetime.datetime.now().time()
        )
        print(header)
        print('sample text via predict : ')
        sentence = sample_line(self.model, 0)
        print(sentence)

        _path = base_path + "sample_sentences_during_training.txt"
        # `with` closes the file even if a write fails; the explicit encoding
        # keeps the Persian text writable regardless of the platform default.
        with open(_path, 'a+', encoding='utf-8') as file_object:
            file_object.write('\n ' + header + ' \n')
            file_object.write('sample text via predict : \n')
            file_object.write(sentence)
  

In [21]:
# create custom callback to save loss and acc

class MyCustomCallbackLoss(tf.keras.callbacks.Callback):
    """Append the training loss of every epoch to ``loss.txt`` under ``base_path``.

    One value is written per line. Accuracy logging (to ``acc.txt``) is
    currently disabled.
    """

    def on_epoch_end(self, epoch, logs=None):
        """Write ``logs["loss"]`` for the finished epoch, if it is available."""
        # `logs` defaults to None in the callback signature; guard against
        # that instead of failing on the subscript.
        loss = (logs or {}).get("loss")
        if loss is None:
            print("no loss reported for this epoch; nothing saved")
            return

        loss_path = base_path + "loss.txt"
        print("saving loss")
        # `with` guarantees the handle is closed even if the write fails.
        with open(loss_path, 'a+') as file_object:
            file_object.write('{}\n'.format(loss))
  

In [26]:
# train model

EPOCH_TIME = 100
# Tokenizer ids start at 1; id 0 is the padding value, hence the extra slot.
WORD_NUM = len(index2word) + 1

# Callback that stores the model weights (weights only) at the checkpoint path.
checkpoint_path = base_path + "cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,
    verbose=1,
)

model = build_model(word_num=WORD_NUM, maxlen=max_len)

# Checkpointing first, then the verse sampler, then the loss logger.
training_callbacks = [
    cp_callback,
    MyCustomCallback(),
    MyCustomCallbackLoss(),
]

model.fit(
    input_seq,
    output_seq,
    batch_size=100,
    epochs=EPOCH_TIME,
    validation_split=0.2,
    shuffle=True,
    verbose=1,
    callbacks=training_callbacks,
)

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 21)]              0         
_________________________________________________________________
token_and_position_embedding (None, 21, 512)           4190208   
_________________________________________________________________
transformer_block_3 (Transfo (None, 21, 64)            837376    
_________________________________________________________________
dropout_10 (Dropout)         (None, 21, 64)            0         
_________________________________________________________________
transformer_block_4 (Transfo (None, 21, 64)            149248    
_________________________________________________________________
dropout_13 (Dropout)         (None, 21, 64)            0         
_________________________________________________________________
dense_136 (Dense)            (None, 21, 8163)          5305

<tensorflow.python.keras.callbacks.History at 0x7f25504be828>