<a href="https://colab.research.google.com/github/n-bzy/iannwtf/blob/main/homework_11_notworking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sentencepiece

In [None]:
!pip install -U tensorflow-text

In [13]:
import tensorflow as tf
import tensorflow_text as tf_text
from tensorflow.keras.layers import Layer
from tensorflow.keras import Model
import sentencepiece as sp
from google.colab import drive
import os
import io
import re
import datetime
import tqdm

In [None]:
#bash code to mount the drive
drive.mount("/content/drive")
os.chdir("drive/MyDrive")

In [5]:
#load the text file to which the model is fitted as a string
with open("bible.txt", "r") as f:
  text = f.read()

# Hyperparameters

In [6]:
WINDOW_SIZE = 120 #try between 32-256
VOCAB_SIZE = 2000 #try between 2000-7000
EMBEDDING_DIM = 64 #try between 64-256
BATCH_SIZE = 32

# Data Preprocessing
CLEANING
- convert to lower case
- remove all characters that aren't letters or spaces

In [7]:
#convert to lower case + replace all characters that aren't letters or spaces with a space
#we replace by a space to avoid forming new words by accident, e.g. end.Start -> endStart
text = re.sub("[^a-z ]+"," ",text.lower()) 
#we remove possible accumulation of spaces 
text = re.sub(" +", " ", text)

TOKENIZATION
- train sentencePiece tokenizer on text
- split data into sub-word tokens by applying the trained tokenizer

In [8]:
#train sentencepiece tokenizer on text data
sp.SentencePieceTrainer.train(input="bible.txt", model_prefix='tokenizer_model', model_type="unigram", vocab_size=VOCAB_SIZE)

In [9]:
# deserialize the trained model file to load it in the correct format
trained_tokenizer_model = tf.io.gfile.GFile('tokenizer_model.model', "rb").read()

# load the model as a tokenizer that can be used inside a tensorflow model
tokenizer = tf_text.SentencepieceTokenizer(
    model=trained_tokenizer_model, out_type=tf.int32, nbest_size=-1, alpha=1, reverse=False,
    add_bos=False, add_eos=False, return_nbest=False, name=None
)

In [10]:
def preprocessing_pipeline(text):
    """Data preprocessing that takes the tokenized text and a window_size as inputs"""

    #tokenize text data with trained Sentence Piece tokenizer
    tokens = tokenizer.tokenize(text)
    # create sliding window inputs of length m+1 
    data = tf_text.sliding_window(data=tokens, width=WINDOW_SIZE+1, axis=0)
    #create a tensorflow dataset
    data = tf.data.Dataset.from_tensor_slices(data)
    #create inputs and targets, where targets are input sequence shifted by 1 element
    data = data.map(lambda data: (data[:WINDOW_SIZE],data[1:]) )
    # cache the dataset
    data = data.cache()
    #shuffle, batch, prefetch
    data = data.shuffle(1000).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
    
    return data

In [11]:
#NOTE: We only train and validate on a smaller part of the text (original length: 4 016 014)
#create train and validation data sets
train_ds = preprocessing_pipeline(text[:1000000])
val_ds = preprocessing_pipeline(text[1000000:1500000])

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


# The model

In [14]:
class EmbeddingBlock(Layer):
    """Embedding layer that embeds the individual token indices + their position in the input"""

    def __init__(self):
        """Constructor contains 2 embedding layers, one for token indices, one for token positions"""
        super(EmbeddingBlock, self).__init__()
        self.idx_embedding = tf.keras.layers.Embedding(input_dim = VOCAB_SIZE, output_dim = EMBEDDING_DIM)
        self.pos_embedding = tf.keras.layers.Embedding(input_dim = WINDOW_SIZE, output_dim = EMBEDDING_DIM)
        

    def call(self, i_seq):
        """Forward step"""
        #construct a tensor, where its elements are used as indices to look up the positional code for each sub-word
        tensor = tf.range(0, WINDOW_SIZE)
        #feed the token index embedding layer with the input sequence
        e1 = self.idx_embedding(i_seq)
        #feed the positional embedding layer with the  range tensor
        e2 = self.pos_embedding(tensor)
        #add the two embeddings
        sum = e1 + e2
        return sum

In [15]:
class TransformerBlock(Layer):
    """ """
    def __init__(self, num_heads):
        """Contructor that works with 2-4 attention heads"""
        super(TransformerBlock, self).__init__()
        self.num_heads = num_heads
        self.mha = tf.keras.layers.MultiHeadAttention(num_heads= self.num_heads, key_dim = EMBEDDING_DIM)
        self.dense1 = tf.keras.layers.Dense(units = 256, activation = tf.nn.relu)
        self.dense2 = tf.keras.layers.Dense(units = EMBEDDING_DIM)
        self.dropOut1 = tf.keras.layers.Dropout(rate = 0.1)
        self.dropOut2 = tf.keras.layers.Dropout(rate = 0.1)
        self.norm1 = tf.keras.layers.LayerNormalization(epsilon = 0.000001)
        self.norm2 = tf.keras.layers.LayerNormalization(epsilon = 0.000001)

    def call(self, input):
        """Forward step"""
        #give input to MHA-layer as both value and query arguments
        #causal mask is True such that model does not attend to future tokens
        mha_out = self.mha(input, input,use_causal_mask=True) 
        #use dropout on the output of MHA layer 
        drop_out = self.dropOut1(mha_out,training=True)
        #add result to layer input
        drop_out += input
        #apply layer normalization
        ln_out = self.norm1(drop_out)

        #use normalized output for another residual connection
        x = self.dense1(ln_out)
        x = self.dense2(x)
        x = self.dropOut2(x,training=True)
        x += ln_out
        x = self.norm2(x)       

        return x
     

In [22]:
def MyModel(Model):
  """Model contains methods for initialization, calling, metric reset, trainstep, and text generation"""

  def __init__(self, tokenizer, num_heads=2):
      """Initialization method sets up all parameters that will be used by other methodsMy
        - tokenizer: sentence piece tokenizer to output text, not just token IDs
        - optimizer: Adam with a learning rate of 0.001
        - loss_function: SparseCategoricalCrossentropy -> targets aren't one-hot encoded, but indices
        - metrics: Mean Loss, Categorical Accuracy, Top K Categorical Accuracy
      """
      super(Model, self).__init__()
      self.tokenizer = tokenizer 
      self.num_heads = num_heads
      self.optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
      self.loss_function = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
      
      self.metrics_list = [
                      tf.keras.metrics.Mean(name="loss"),
                      tf.keras.metrics.CategoricalAccuracy(name="acc"),
                      tf.keras.metrics.TopKCategoricalAccuracy(3,name="top-3-acc") 
                      ]
      
      self.layerList = [
                        EmbeddingBlock(),
                        TransformerBlock(self.num_heads),
                        #tf.keras.layers.GlobalAveragePooling1D(),
                        tf.keras.layers.Dense(units=VOCAB_SIZE)
                        ]
    
  def call(self, x):
      """Forward step through all layers"""
      for layer in self.layerList.layers:
          x = layer(x)
      return x
  
  def reset_metrics(self):
      for metric in self.metrics:
        metric.reset_states()
  
  @tf.function
  def train_step(self, data):
      x, targets = data
      with tf.GradientTape() as tape:
          predictions = self(x, training=True)
          loss = self.loss_function(targets, predictions) + tf.reduce_sum(self.losses)
      gradients = tape.gradient(loss, self.trainable_variables)
      self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
      # update loss metric
      self.metrics[0].update_state(loss)
      # for all metrics except loss, update states (accuracy etc.)
      for metric in self.metrics[1:]:
          metric.update_state(targets,predictions)
      # Return a dictionary mapping metric names to current value
      return {m.name: m.result() for m in self.metrics}

  def generate_text(self, prompt, length, top_k=100):
      """Method 
        - prompt: the text (string)
        - length: the desired output length 
        - top_k: specifies the amount of most likely (sub-)words we want to sample from
        - returns a continuation of the input prompt of a specified length"""
      
      #tokenize prompt
      prompt = tokenizer.tokenize(prompt)

      #generate next token of current prompt until requested output length is reached
      while len(prompt) <= length:
        #add batch dimension
        prompt = tf.expand_dims(prompt, axis=0)
        #create padded prompt
        paddings = tf.constant([[0, 0, ], [WINDOW_SIZE-len(prompt), 0 ]])
        pad_prompt = tf.pad(prompt, paddings, mode="CONSTANT", constant_values=0)
        #obtain the logits from the model by calling it on the padded prompt
        #logits = unnormalized scores for likelihood of each token in vocabulary to be next
        logits = self.call(pad_prompt)

        #apply top_k to find the k most likely next tokens based on their logit scores
        top_k_logits, indices = tf.math.top_k(logits, top_k, sorted=True)
        #indices = tf.cast(indices, tf.float32)
        #sample next token from top_k tokens
        next_token = tf.random.categorical(indices, num_samples=1)
        #add new token to prompt
        prompt = tf.concat((prompt,next_token), axis=1)
        #truncate length of input by cutting of beginning of prompt
        prompt = prompt[-len(prompt)-1:]

      #use tokenizer to detokenize the result
      out = tokenizer.detokenize(prompt)

      return out

# Training

CREATE TENSORBOARD

In [17]:
#clean all the logs
!rm -rf ./logs/

In [18]:
# load tensorboard extension
%load_ext tensorboard

# Define where to save the log
config_name = "Homework11"
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

#we create a train and validation writer
train_log_path = f"logs/{config_name}/{current_time}/train"
train_summary_writer = tf.summary.create_file_writer(train_log_path)
#val_log_path = f"logs/{config_name}/{current_time}/val"
#val_summary_writer = tf.summary.create_file_writer(val_log_path)

TRAINING LOOP

In [24]:
num_heads = 2 #normally 2-4
starting_prompt = "What is"
length = 30 #length of generated text

#instantiate model
model = MyModel(tokenizer)

NOTE: Test_step for validation dataset is not implemented in model yet (necessary?!)

In [25]:
#we will train 100 epochs (recommended are 100-600 epochs)
for epoch in range(100):

    #TRAINING DATASET
    for data in train_ds:
        metrics = model.train_step(data)

        with train_summary_writer.as_default():
            #insert metrics into tensorboard log of current epoch
            for metric in model.metrics:
                tf.summary.scalar(metric.name, metric.result(), step=epoch)
            #insert generated text into tensorboard log of current epoch
            generated_text = model.generate_text(starting_prompt, length)
            tf.summary.text("generated_text", generated_text, step = epoch)
    
    #print generated text of each epoch
    print("Epoch: " + str(epoch))
    print(model.generate_text(starting_prompt, length))
    print()
    
    # reset all metrics (requires a reset_metrics method in the model)
    model.reset_metrics() 

    """
    #VALIDATION DATASET
    for data in val_ds:
        metrics = model.test_step(data)

        with val_summary_writer.as_default():
              for metric in model.metrics:
                  tf.summary.scalar(metric.name,metric.result(),step=epoch)

    # reset all metrics
    model.reset_metrics()
    """

AttributeError: ignored

VISUALIZATION

In [None]:
%tensorboard --logdir logs/