<a href="https://colab.research.google.com/github/n-bzy/iannwtf/blob/main/homework11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tensorflow_text
!pip install sentencepiece

In [2]:
import tensorflow as tf
import tensorflow_text as tf_txt
import sentencepiece as sp
import numpy as np
import re
import io
import datetime
import tqdm
import math

# bash code to mount the drive
import os
from google.colab import drive
drive.mount("/content/drive")
os.chdir("/content/drive/MyDrive")

file_path = f"/content/drive/MyDrive/bible.txt"

with open(file_path, "r") as f:
    text = f.read()

print(text[:100])

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
The First Book of Moses:  Called Genesis


1:1 In the beginning God created the heaven and the earth


In [3]:
def tokenize(text, voc_size):
    """Preprocess text data by lower case, remove special characters and split 
    words with a SentencePie tokenizer
    input: text,vocabulary size
    output: tokenized text"""

    text = text.lower()
    text = re.sub(r"[^a-z]+", " ", text)

    sp.SentencePieceTrainer.train(
    input=f"/content/drive/MyDrive/bible.txt", model_prefix='tokenizer_model', model_type="unigram", vocab_size=voc_size)

    # deserialize the trained model file to load it in the correct format
    trained_tokenizer_model = tf.io.gfile.GFile('tokenizer_model.model', "rb").read()

    # load the model as a tokenizer that can be used inside a tensorflow model
    tokenizer = tf_txt.SentencepieceTokenizer(
        model=trained_tokenizer_model, out_type=tf.int32, nbest_size=-1, alpha=1, reverse=False,
        add_bos=False, add_eos=False, return_nbest=False, name=None)
    
    tokens = tokenizer.tokenize(text)
    return tokens, tokenizer 

tokens, tokenizer = tokenize(text, voc_size=5000)
print(tokens)

tf.Tensor([   4  273  562 ...   31   19 1670], shape=(949085,), dtype=int32)


In [4]:
def input_target(tokens,seq_length,batch_size):
    #data = tf.data.Dataset.from_tensor_slices(tokens)
    data = tf_txt.sliding_window(tokens, width=seq_length+1)

    #input = data[:seq_length]
    #target = data[1:]

    #data = tf.data.Dataset.zip((tf.data.Dataset.from_tensor_slices(input), tf.data.Dataset.from_tensor_slices(target)))
    data = tf.data.Dataset.from_tensor_slices(data)
    data = data.shuffle(1024).batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return data

#ds = input_target(tokens, seq_length=128, batch_size=64)
train_ds = input_target(tokens[:math.ceil(len(tokens)*0.9)], seq_length=128, batch_size=128)
test_ds = input_target(tokens[math.ceil(len(tokens)*0.9):], seq_length=128, batch_size=128)

for x in train_ds.take(1):
    print(x.shape)

(128, 129)


In [5]:
class Emb(tf.keras.layers.Layer):
    """"""
    def __init__(self, voc_size, emb_size, seq_length):
        super().__init__()

        self.emb = tf.keras.layers.Embedding(input_dim=voc_size,output_dim=emb_size)
        self.pos = tf.keras.layers.Embedding(input_dim=seq_length,output_dim=emb_size)

    def call(self,input):
        t = tf.range(0,len(input))
        t = tf.expand_dims(t,-1)
        x = self.emb(input)
        y = self.pos(t)
        #print(x.shape, y.shape)
        z = x+y
        return z


In [6]:
class TransformerBlock(tf.keras.layers.Layer):
    """"""
    def __init__(self, emb_size):
        super().__init__()

        self.mha = tf.keras.layers.MultiHeadAttention(num_heads=3,key_dim=emb_size)
        self.denseRELU = tf.keras.layers.Dense(128,activation='relu')
        self.dense = tf.keras.layers.Dense(emb_size)
        self.drop1 = tf.keras.layers.Dropout(0.1)
        self.drop2 = tf.keras.layers.Dropout(0.1)
        self.ln1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.ln2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    def call(self, input, training=False):
        x = self.mha(query=input,value=input, use_causal_mask=True)
        x = self.drop1(x, training=training)
        x = x + input
        x = self.ln1(x)
        z = self.denseRELU(x)
        z = self.dense(z)
        z = self.drop2(z, training=training)
        y = x + z
        y = self.ln2(y)
        return y


In [7]:
class Transformer(tf.keras.Model):
    """"""
    def __init__(self, voc_size, emb_size, seq_length, tokenizer):
        super().__init__()

        self.opt = tf.keras.optimizers.Adam()
        self.loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

        self.metrics_list = [tf.keras.metrics.Mean(name="loss")]
        
        self.emb = Emb(voc_size, emb_size, seq_length)
        self.tfb = TransformerBlock(emb_size)
        self.dense = tf.keras.layers.Dense(voc_size)
        
        self.tokenizer = tokenizer
        self.seq_length = seq_length

    def call(self, input,  training=False):
        x = self.emb(input)
        x = self.tfb(x, training=training)
        x = self.dense(x)
        return x

    def reset_metrics(self):
        for metric in self.metrics:
            metric.reset_states()
            
    @tf.function
    def train_step(self, data):
        
        x = data[:,:self.seq_length]
        t = data[:,1:]
        
        with tf.GradientTape() as tape:
            predictions = self(x, training=True)
            loss = self.loss(t, predictions) + tf.reduce_sum(self.losses)
        
        gradients = tape.gradient(loss, self.trainable_variables)
        self.opt.apply_gradients(zip(gradients, self.trainable_variables))
        
        # update loss metric
        self.metrics[0].update_state(loss)

        # Return a dictionary mapping metric names to current value
        return {m.name: m.result() for m in self.metrics}

    @tf.function
    def test_step(self, data):

        x = data[:,:self.seq_length]
        t = data[:,1:]

        predictions = self(x, training=False)
        loss = self.loss(t, predictions) + tf.reduce_sum(self.losses)

        self.metrics[0].update_state(loss)

        return {m.name: m.result() for m in self.metrics}
    
    def generate_text(self, prompt, output_length, top_k):
        tokens = self.tokenizer.tokenize(prompt)
        for _ in range(output_length):
            #x = tf.expand_dims(tokens, 0)

            y = self(x, training=False)
            highest_logits = tf.math.top_k(y, k = top_k, sorted = True)
            sample_number = tf.random.uniform(shape=(), minval=0, maxval=top_k, dtype=tf.int32)
            vocabulary_index = highest_logits.indices.numpy()[0, -1, sample_number]
            tokens = tf.concat([tokens, [vocabulary_index]], -1)
        
        return self.tokenizer.detokenize(tokens)

In [8]:
model = Transformer(voc_size=5000, emb_size=128, seq_length=128, tokenizer=tokenizer)

# run model on input once so the layers are built
model(tf.keras.Input((129)));
model.summary()

Model: "transformer"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 emb (Emb)                   multiple                  656384    
                                                                 
 transformer_block (Transfor  multiple                 231424    
 merBlock)                                                       
                                                                 
 dense_2 (Dense)             multiple                  645000    
                                                                 
Total params: 1,532,810
Trainable params: 1,532,808
Non-trainable params: 2
_________________________________________________________________


In [10]:
import tqdm

def training_loop(model, train_ds, val_ds, epochs):#, train_summary_writer, val_summary_writer):
    for epoch in range(epochs):
        print(f"Epoch {epoch}:")
        
        for data in tqdm.tqdm(train_ds, position=0, leave=True):
            metrics = model.train_step(data)
            
            # logging the validation metrics to the log file which is used by tensorboard
            #with train_summary_writer.as_default():
                #for metric in model.metrics:
                    #tf.summary.scalar(f"{metric.name}", metric.result(), step=epoch)

        # print the metrics
        print([f"{key}: {value.numpy()}" for (key, value) in metrics.items()])

        # reset all metrics (requires a reset_metrics method in the model)
        model.reset_metrics()    
        
        # Validation:
        for data in val_ds:
            metrics = model.test_step(data)
        
            # logging the validation metrics to the log file which is used by tensorboard
            #with val_summary_writer.as_default():
                #for metric in model.metrics:
                    #tf.summary.scalar(f"{metric.name}", metric.result(), step=epoch)
                    
        print([f"val_{key}: {value.numpy()}" for (key, value) in metrics.items()])

        # reset all metrics
        model.reset_metrics()
        #print("\n")

        gen_text = model.generate_text('What is', 5, 5)
        tf.print(gen_text)

training_loop(model,train_ds, test_ds, epochs=10,)

Epoch 0:


100%|██████████| 6673/6673 [06:58<00:00, 15.94it/s]


['loss: 3.542757987976074']
['val_loss: 5.03262186050415']
tf.Tensor(b'What is given ordained seen been ordained', shape=(), dtype=string)
Epoch 1:


100%|██████████| 6673/6673 [06:54<00:00, 16.10it/s]


['loss: 3.465855360031128']
['val_loss: 5.095674991607666']
tf.Tensor(b'What is ordained been done been ordained', shape=(), dtype=string)
Epoch 2:


100%|██████████| 6673/6673 [06:54<00:00, 16.11it/s]


['loss: 3.4033737182617188']
['val_loss: 5.146303176879883']
tf.Tensor(b'What is made done been done ordained', shape=(), dtype=string)
Epoch 3:


100%|██████████| 6673/6673 [06:52<00:00, 16.19it/s]


['loss: 3.3606479167938232']
['val_loss: 5.211519241333008']
tf.Tensor(b'What is made spoken been been made', shape=(), dtype=string)
Epoch 4:


100%|██████████| 6673/6673 [06:52<00:00, 16.20it/s]


['loss: 3.33132266998291']
['val_loss: 5.261597633361816']
tf.Tensor(b'What is made seen made ordained made', shape=(), dtype=string)
Epoch 5:


100%|██████████| 6673/6673 [06:51<00:00, 16.23it/s]


['loss: 3.3094565868377686']
['val_loss: 5.309690952301025']
tf.Tensor(b'What is been ordained made ordained been', shape=(), dtype=string)
Epoch 6:


100%|██████████| 6673/6673 [06:51<00:00, 16.22it/s]


['loss: 3.292238473892212']
['val_loss: 5.371089458465576']
tf.Tensor(b'What is ordained been said made ordained', shape=(), dtype=string)
Epoch 7:


100%|██████████| 6673/6673 [06:51<00:00, 16.23it/s]


['loss: 3.2793240547180176']
['val_loss: 5.397314548492432']
tf.Tensor(b'What is been made made made made', shape=(), dtype=string)
Epoch 8:


100%|██████████| 6673/6673 [06:51<00:00, 16.21it/s]


['loss: 3.269660711288452']
['val_loss: 5.410665035247803']
tf.Tensor(b'What is ordained done been made been', shape=(), dtype=string)
Epoch 9:


100%|██████████| 6673/6673 [07:21<00:00, 15.10it/s]


['loss: 3.2612290382385254']
['val_loss: 5.424385070800781']
tf.Tensor(b'What is raised been made said made', shape=(), dtype=string)
