<a href="https://colab.research.google.com/github/n-bzy/iannwtf/blob/main/homework_11_model_error.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sentencepiece
!pip install tensorflow_text

In [2]:
import tensorflow as tf
import tensorflow_text as tf_text
from tensorflow.keras.layers import Layer
from tensorflow.keras import Model
import sentencepiece as sp
import math
from google.colab import drive
import os
import io
import re
import datetime
import tqdm

In [None]:
#bash code to mount the drive
drive.mount("/content/drive")
os.chdir("drive/MyDrive")

In [4]:
#load the text file to which the model is fitted as a string
with open("bible.txt", "r") as f:
  text = f.read()

In [5]:
print(text[:100])

The First Book of Moses:  Called Genesis


1:1 In the beginning God created the heaven and the earth


# Hyperparameters

In [6]:
WINDOW_SIZE = 128 #sequence length, try between 32-256
VOCAB_SIZE = 2000 #try between 2000-7000
EMBEDDING_DIM = 64 #try between 64-256
BATCH_SIZE = 128

# Data Preprocessing


In [7]:
def tokenize(text):
    #data cleaning: convert to lower case + remove all characters that aren't letters or spaces
    text = re.sub("[^a-z]+"," ",text.lower()) 
    #train sentence-piece tokenizer on text data
    sp.SentencePieceTrainer.train(input="bible.txt", model_prefix='tokenizer_model', model_type="unigram", vocab_size=VOCAB_SIZE)
    #deserialize the trained model file to load it in the correct format
    trained_tokenizer_model = tf.io.gfile.GFile('tokenizer_model.model', "rb").read()
    #load the model as a tokenizer that can be used inside a tensorflow model
    tokenizer = tf_text.SentencepieceTokenizer(
        model=trained_tokenizer_model, out_type=tf.int32, nbest_size=-1, alpha=1, reverse=False,
        add_bos=False, add_eos=False, return_nbest=False, name=None)
    #tokenize text data with trained Sentence Piece tokenizer
    tokens = tokenizer.tokenize(text)
    return tokens, tokenizer

In [8]:
tokens, tokenizer = tokenize(text)

In [9]:
print(tokens)

tf.Tensor([  4 345 739 ...  35 196 122], shape=(1056835,), dtype=int32)


In [10]:
def preprocess(tokens):
    """Data preprocessing on the tokenized text"""

    # create sliding window inputs of length m+1 
    data = tf_text.sliding_window(data=tokens, width=WINDOW_SIZE+1)

    """
    #create input seq and target seq for each token window
    inputs = []
    targets = []
    for n in range(len(data)):
        inputs.append(data[n][:WINDOW_SIZE])
        targets.append(data[n][-WINDOW_SIZE:])
    """

    #create a tensorflow dataset
    data = tf.data.Dataset.from_tensor_slices(data)
    # cache the dataset
    data = data.cache()
    #shuffle, batch, prefetch
    data = data.shuffle(1000).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
    
    return data

In [11]:
#create train and validation data sets
train_ds = preprocess(tokens[:math.ceil(len(tokens)*0.9)])
val_ds = preprocess(tokens[math.ceil(len(tokens)*0.9):])

for x in train_ds.take(1):
  print(x.shape)

(128, 129)


# The model

In [12]:
class EmbeddingBlock(Layer):
    """Embedding layer that embeds the individual token indices + their position in the input"""

    def __init__(self):
        """Constructor contains 2 embedding layers, one for token indices, one for token positions"""
        super().__init__()
        self.idx_embedding = tf.keras.layers.Embedding(input_dim = VOCAB_SIZE, output_dim = EMBEDDING_DIM)
        self.pos_embedding = tf.keras.layers.Embedding(input_dim = WINDOW_SIZE, output_dim = EMBEDDING_DIM)
        

    def call(self, input):
        """Forward step"""
        #construct a tensor, where its elements are used as indices to look up the positional code for each sub-word
        tensor = tf.range(0, len(input))
        tensor = tf.expand_dims(tensor,-1)
        #feed the token index embedding layer with the input sequence
        e1 = self.idx_embedding(input)
        #feed the positional embedding layer with the  range tensor
        e2 = self.pos_embedding(tensor)
        #add the two embeddings
        sum = e1 + e2
        return sum

In [13]:
class TransformerBlock(Layer):
    """ """
    def __init__(self):
        """Contructor that works with 2-4 attention heads"""
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(num_heads = 3, key_dim = EMBEDDING_DIM)
        self.dense1 = tf.keras.layers.Dense(units = 128, activation = tf.nn.relu)
        self.dense2 = tf.keras.layers.Dense(units = EMBEDDING_DIM)
        self.dropOut1 = tf.keras.layers.Dropout(rate = 0.1)
        self.dropOut2 = tf.keras.layers.Dropout(rate = 0.1)
        self.norm1 = tf.keras.layers.LayerNormalization(epsilon = 0.000001)
        self.norm2 = tf.keras.layers.LayerNormalization(epsilon = 0.000001)

    def call(self, input, training=False):
        """Forward step"""
        #give input to MHA-layer as both value and query arguments
        #causal mask is True such that model does not attend to future tokens
        mha_out = self.mha(query=input, value=input, use_causal_mask=True) 
        #use dropout on the output of MHA layer 
        drop_out = self.dropOut1(mha_out,training=training)
        #add result to layer input
        drop_out += input
        #apply layer normalization
        ln_out = self.norm1(drop_out)

        #use normalized output for another residual connection
        x = self.dense1(ln_out)
        x = self.dense2(x)
        x = self.dropOut2(x,training=training)
        x += ln_out
        x = self.norm2(x)       

        return x
     

In [14]:
def Transformer(Model):
  """Model contains methods for initialization, calling, metric reset, trainstep, and text generation"""

  def __init__(self, tokenizer):
      """Initialization method sets up all parameters that will be used by other methodsMy
        - tokenizer: sentence piece tokenizer to output text, not just token IDs
        - optimizer: Adam with a learning rate of 0.001
        - loss_function: SparseCategoricalCrossentropy -> targets aren't one-hot encoded, but indices
        - metrics: Mean Loss, Categorical Accuracy, Top K Categorical Accuracy
      """
      super().__init__()
      self.tokenizer = tokenizer 
      self.optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
      self.loss_function = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
      
      self.metrics_list = [
                      tf.keras.metrics.Mean(name="loss"),
                      tf.keras.metrics.CategoricalAccuracy(name="acc"),
                      tf.keras.metrics.TopKCategoricalAccuracy(3,name="top-3-acc") 
                      ]
      
      self.layerList = [
                        EmbeddingBlock(),
                        TransformerBlock(),
                        tf.keras.layers.Dense(units=VOCAB_SIZE)
                        ]
    
  def call(self, x):
      """Forward step through all layers"""
      for layer in self.layerList.layers:
          x = layer(x)
      return x
  
  def reset_metrics(self):
      for metric in self.metrics:
        metric.reset_states()
  
  @tf.function
  def train_step(self, data):
      #split data into input and target sequences
      x = data[:,:WINDOW_SIZE]
      targets = data[:,1:]
      #tape loss and prediction
      with tf.GradientTape() as tape:
          predictions = self(x, training=True)
          loss = self.loss_function(targets, predictions) + tf.reduce_sum(self.losses)
      gradients = tape.gradient(loss, self.trainable_variables)
      self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
      #update loss metric
      self.metrics[0].update_state(loss)
      #update the two accuracy metrics
      for metric in self.metrics[1:]:
          metric.update_state(targets,predictions)
      #return a dictionary mapping metric names to current value
      return {m.name: m.result() for m in self.metrics}

  @tf.function
  def test_step(self, data):
      #split data into input and target sequences
      x = data[:,:WINDOW_SIZE]
      targets = data[:,1:]
      #get loss and prediction
      predictions = self(x, training=False)
      loss = self.loss_function(targets, predictions) + tf.reduce_sum(self.losses)
      #update loss metric
      self.metrics[0].update_state(loss)
      #update the two accuracy metrics
      for metric in self.metrics[1:]:
          metric.update_state(targets,predictions)
      #return a dictionary mapping metric names to current value
      return {m.name: m.result() for m in self.metrics}
  
  def generate_text(self, prompt, length, top_k=100):
      """Method 
        - prompt: the text (string)
        - length: the desired output length 
        - top_k: specifies the amount of most likely (sub-)words we want to sample from
        - returns a continuation of the input prompt of a specified length"""
      
      #tokenize prompt
      prompt = self.tokenizer.tokenize(prompt)

      #generate next token of current prompt until requested output length is reached
      while len(prompt) <= length:
        #add batch dimension
        prompt = tf.expand_dims(prompt, axis=0)
        #create padded prompt
        paddings = tf.constant([[0, 0, ], [WINDOW_SIZE-len(prompt), 0 ]])
        pad_prompt = tf.pad(prompt, paddings, mode="CONSTANT", constant_values=0)
        #obtain the logits from the model by calling it on the padded prompt
        #logits = unnormalized scores for likelihood of each token in vocabulary to be next
        logits = self.call(pad_prompt)

        #apply top_k to find the k most likely next tokens based on their logit scores
        top_k_logits, indices = tf.math.top_k(logits, top_k, sorted=True)
        #indices = tf.cast(indices, tf.float32)
        #sample next token from top_k tokens
        next_token = tf.random.categorical(indices, num_samples=1)
        #add new token to prompt
        prompt = tf.concat((prompt,next_token), axis=1)
        #truncate length of input by cutting of beginning of prompt
        prompt = prompt[-len(prompt)-1:]
  
      #use tokenizer to detokenize the result
      out = self.tokenizer.detokenize(prompt)

      return out

# Training

CREATE TENSORBOARD

In [15]:
#clean all the logs
!rm -rf ./logs/

In [16]:
# load tensorboard extension
%load_ext tensorboard

# Define where to save the log
config_name = "Homework11"
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

#we create a train and validation writer
train_log_path = f"logs/{config_name}/{current_time}/train"
train_summary_writer = tf.summary.create_file_writer(train_log_path)
#val_log_path = f"logs/{config_name}/{current_time}/val"
#val_summary_writer = tf.summary.create_file_writer(val_log_path)

TRAINING LOOP

In [17]:
num_heads = 2 #normally 2-4
starting_prompt = "What is"
length = 30 #length of generated text

#instantiate model
model = Transformer(tokenizer=tokenizer)

#run model on input once so the layers are built
model(tf.keras.Input((129)))
model.summary()

TypeError: ignored

In [None]:
#we will train 100 epochs (recommended are 100-600 epochs)
for epoch in range(100):

    #TRAINING DATASET
    for data in train_ds:
        metrics = model.train_step(data)

        with train_summary_writer.as_default():
            #insert metrics into tensorboard log of current epoch
            for metric in model.metrics:
                tf.summary.scalar(metric.name, metric.result(), step=epoch)
            #insert generated text into tensorboard log of current epoch
            generated_text = model.generate_text(starting_prompt, length)
            tf.summary.text("generated_text", generated_text, step = epoch)
    
    #print generated text of each epoch
    print("Epoch: " + str(epoch))
    print(model.generate_text(starting_prompt, length))
    # reset all metrics (requires a reset_metrics method in the model)
    model.reset_metrics() 

    """
    #VALIDATION DATASET
    for data in val_ds:
        metrics = model.test_step(data)

        with val_summary_writer.as_default():
              for metric in model.metrics:
                  tf.summary.scalar(metric.name,metric.result(),step=epoch)

    # reset all metrics
    model.reset_metrics()
    """

AttributeError: ignored

VISUALIZATION

In [None]:
%tensorboard --logdir logs/