In [11]:
from tqdm import tqdm
import datetime
import tensorflow as tf
import tensorflow_text as tft
import numpy as np
import sentencepiece as sp


vocab_size = 4096
corpus_file = 'capital.txt'

sp.SentencePieceTrainer.train(input=corpus_file, model_prefix='tokenizer_model', 
                              model_type="unigram", vocab_size=vocab_size)
trained_tokenizer_model = tf.io.gfile.GFile('tokenizer_model.model', "rb").read()
tokenizer = tft.SentencepieceTokenizer(model=trained_tokenizer_model, out_type=tf.int32, 
                                          nbest_size=-1, alpha=1, reverse=False, add_bos=False, 
                                          add_eos=False, return_nbest=False, name=None)
tokens = tokenizer.tokenize("thou shall not do a capitalism")
print(tokens)

tf.Tensor([  16  134  965  535   30  194   11   14 1587], shape=(9,), dtype=int32)


sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: capital.txt
  input_format: 
  model_prefix: tokenizer_model
  model_type: UNIGRAM
  vocab_size: 4096
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
  differential_privacy_noise_level: 0
  differential_privacy

In [12]:
window_size = 33#65


with open(corpus_file, 'r') as file:
    corpus = file.read()

corpus_tokens = tokenizer.tokenize(corpus)
windows = tft.sliding_window(corpus_tokens, window_size)

In [13]:
dataset = tf.data.Dataset.from_tensor_slices(windows)

def preprocessing(data):
    data = data.map(lambda x: (x[:-1], x[1:]))
    data = data.shuffle(42)
    data = data.batch(16)
    #data = data.cache()
    data = data.prefetch(20)
    return data

dataset = dataset.apply(preprocessing)
next(iter(dataset))

(<tf.Tensor: shape=(16, 32), dtype=int32, numpy=
 array([[1028,   57, 1006,    5, 1631,    5,  644, 1406,  149,   74, 2804,
            9, 1116,   82,  911,  149,   74, 2804, 1709,  149,   74,   24,
         3008, 1883,  167,    8,    5,   11, 3416,   74, 3993,   13],
        [  94,   82, 3397, 1419,    5, 1653, 1509, 2364,  600,   39,   80,
         1378,  879,   74,   24, 1028,   57, 1006,    5, 1631,    5,  644,
         1406,  149,   74, 2804,    9, 1116,   82,  911,  149,   74],
        [ 911,  149,   74, 2804, 1709,  149,   74,   24, 3008, 1883,  167,
            8,    5,   11, 3416,   74, 3993,   13, 2570,    9, 1918,   48,
          481,  443, 1332,  299,  504,    5, 1918,    9,    3,  107],
        [1509, 2364,  600,   39,   80, 1378,  879,   74,   24, 1028,   57,
         1006,    5, 1631,    5,  644, 1406,  149,   74, 2804,    9, 1116,
           82,  911,  149,   74, 2804, 1709,  149,   74,   24, 3008],
        [ 644, 1406,  149,   74, 2804,    9, 1116,   82,  911,  149,   

In [4]:
class Model(tf.keras.Model):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.is_setup = False
        self.accuracy_metric = tf.keras.metrics.CategoricalAccuracy(name="acc")
        self.loss_metric = tf.keras.metrics.Mean(name="loss")
        self.prev_epochs = 0
    
    
    def reset_metrics(self):
        for metric in self.metrics:
            metric.reset_state()
    
    
    def setup(self):
        if self.is_setup:
            return
        # DEFINE PATHS
        current_time = datetime.datetime.now().strftime("%Y.%m.%d-%H:%M:%S")
        path = f"logs/{current_time}"
        # CONSTRUCT WRITERS
        self.writer_train = tf.summary.create_file_writer(path)
        self.is_setup = True
    
    
    @tf.function
    def call(self, x, training=False):
        for layer in self.layers:
            x = layer(x, training=training)
        return x
    
    
    @tf.function
    def step(self, X, T, training=True):
        if training:
            with tf.GradientTape() as tape:
                Y = self(X, training=training)
                L = self.loss(T, Y)
            gradient = tape.gradient(L, self.trainable_weights)
            self.optimizer.apply_gradients(zip(gradient, self.trainable_weights))
        else:
            Y = self(X, training)
            L = self.loss(T, Y)
        self.loss_metric.update_state(L)
        #self.accuracy_metric.update_state(T, Y)
        return {metric.name: float(metric.result()) for metric in self.metrics}
    
    
    def train(self, dataset, epochs):
        training_metrics = {metric.name: [] for metric in self.metrics}
        self.setup()
        with self.writer_train.as_default():
            tf.summary.text('text', self.generate_text('The', output_length=16, top_k=64), step=0)
        with tqdm(epochs) as bar:
            for epoch in range(epochs):
                # TRAINING
                bar.set_description('TRAINING')
                for X, T  in dataset:
                    metrics = self.step(X, T, training=True)
                    for name, value in metrics.items():
                        training_metrics[name].append(value)
                # WRITING METRICS
                with self.writer_train.as_default():
                    for metric in self.metrics:
                        tf.summary.scalar(metric.name, metric.result(), step=self.prev_epochs)
                    tf.summary.text('text', self.generate_text('The', output_length=16, top_k=64), step=self.prev_epochs)
                self.reset_metrics()
                bar.update(1)
                self.prev_epochs += 1
            metrics = {metric.name: training_metrics[metric.name] for metric in self.metrics}
            return metrics

In [17]:
class Attention(tf.keras.layers.Layer):
    def __init__(self, n_units, drop_rate=0.1, mask=True, **kwargs):
        super().__init__(**kwargs)
        self.n_units = n_units
        self.mask = mask
        self.drop = tf.keras.layers.Dropout(drop_rate)
        self.W_Q  = tf.keras.layers.Dense(self.n_units, use_bias=False, 
                                          kernel_regularizer=tf.keras.regularizers.L2())
        self.W_K  = tf.keras.layers.Dense(self.n_units, use_bias=False, 
                                          kernel_regularizer=tf.keras.regularizers.L2())
        self.W_V  = tf.keras.layers.Dense(self.n_units, use_bias=False, 
                                          kernel_regularizer=tf.keras.regularizers.L2())
    
    
    @tf.function
    def call(self, X, C=None, training=True): # X: Values, C: Context
        C = X if C is None else C
        dk = X.shape[1]
        Q = self.W_Q(C)
        K = self.W_K(C)
        V = self.W_V(X)
        A = tf.einsum('bik, bjk -> bij', Q, K) / tf.math.sqrt(tf.cast(dk, tf.float32))
        if self.mask:
            mask = tf.constant([[[1] * (i + 1) + [0] * (dk - i - 1) for i in range(dk)]], dtype=tf.float32)
            A = A * mask
        A = tf.nn.softmax(A, axis=2)
        A = self.drop(A, training=training)
        return tf.einsum('bij, bjk -> bik', A, V)



class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, n_units, n_heads, drop_rate=0.1, **kwargs):
        super().__init__(**kwargs)
        self.n_units = n_units
        self.n_heads = n_heads
        self.linear = tf.keras.layers.Dense(n_units)
        self.heads = [Attention(n_units, drop_rate) for _ in range(n_heads)]
    
    
    @tf.function
    def call(self, X, C=None, training=True): # X: Values, C: Context
        C = X if C is None else C
        Y = tf.concat([head(X, C, training=training) for head in self.heads], axis=2)
        Z = self.linear(Y)
        return Z



class Block(tf.keras.layers.Layer):
    def __init__(self, n_units, n_heads, drop_rate=0.1, **kwargs):
        super().__init__(**kwargs)
        #self.multihead  = MultiHeadAttention(n_units, n_heads, drop_rate)
        self.multihead  = tf.keras.layers.MultiHeadAttention(num_heads=n_heads, key_dim=n_units, dropout=drop_rate)
        self.norm_one   = tf.keras.layers.BatchNormalization()
        self.norm_two   = tf.keras.layers.BatchNormalization()
        self.affine_one = tf.keras.layers.Dense(n_units * 4)
        self.affine_two = tf.keras.layers.Dense(n_units)
        self.drop_one   = tf.keras.layers.Dropout(drop_rate)
        self.drop_two   = tf.keras.layers.Dropout(drop_rate)
        self.drop_three = tf.keras.layers.Dropout(drop_rate)
    
    
    @tf.function
    def call(self, X, training=False):
        Z = self.norm_one(X,   training=training)
        Z = self.multihead(Z, Z, Z, use_causal_mask=True, training=training)
        #Z = self.multihead(Z,  training=training)
        Z = self.drop_two(Z,   training=training)
        Z = X + Z
        X = self.norm_one(Z,   training=training)
        Z = self.affine_one(X)
        Z = tf.nn.gelu(Z)
        Z = self.affine_two(Z)
        Z = self.drop_three(Z, training=training)
        return X + Z



class Embedding(tf.keras.layers.Layer):
    def __init__(self, vocab_size, embed_size, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.word_embed = tf.keras.layers.Embedding(vocab_size, embed_size)
        self.pos_embed  = tf.keras.layers.Embedding(vocab_size, embed_size)
    
    
    @tf.function
    def call(self, X):
        word_embed = self.word_embed(X)
        pos_embed  = self.pos_embed(tf.range(X.shape[1]))
        return word_embed + pos_embed
        
        

class GPT(Model):
    def __init__(self, tokenizer, n_vocab, n_units, n_layers, n_heads, drop_rate=0.1, **kwargs):
        super().__init__(**kwargs)
        self.tokenizer = tokenizer
        self.embedding = Embedding(n_vocab, n_units)
        self.drop = tf.keras.layers.Dropout(drop_rate)
        self.blocks = [Block(n_units, n_heads, drop_rate) for _ in range(n_layers)]
        self.norm = tf.keras.layers.BatchNormalization()
        self.affine = tf.keras.layers.Dense(n_vocab)
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
        self.loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    
    
    @tf.function
    def call(self, X, training=False):
        X = self.embedding(X)
        X = self.drop(X, training=training)
        for block in self.blocks:
            X = block(X, training=training)
        X = self.norm(X, training=training)
        X = self.affine(X)
        if not training:
            X = tf.nn.softmax(X, axis=2)
        return X
    
    
    def generate_text(self, prompt: str, output_length: int, top_k: int):
        tokens = tokenizer.tokenize(prompt).numpy().tolist()
        for _ in range(output_length):
            X = tf.constant(tokens, dtype=tf.int32)[None,...]
            logits = self(X)[0,-1,:]
            top_val, top_idx = tf.math.top_k(logits, k=top_k)
            idx = tf.random.categorical(top_val[None, ...], 1)[0, 0]
            token = top_idx[idx].numpy()
            tokens.append(token)
        return self.tokenizer.detokenize(tokens)



model = GPT(tokenizer, vocab_size, n_units=256, n_layers=8, n_heads=4)
#model.load_weights('GPT_V1')
model.generate_text('The', output_length=12, top_k=5)

TypeError: Exception encountered when calling layer "gpt_1" "                 f"(type GPT).

in user code:

    File "/tmp/ipykernel_140399/3207563206.py", line 114, in call  *
        X = block(X, training=training)
    File "/home/mortimer/.local/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler  **
        raise e.with_traceback(filtered_tb) from None
    File "/tmp/__autograph_generated_file3nupcjvh.py", line 11, in tf__call
        Z = ag__.converted_call(ag__.ld(self).multihead, (ag__.ld(Z),), dict(training=ag__.ld(training)), fscope)

    TypeError: Exception encountered when calling layer "block_8" "                 f"(type Block).
    
    in user code:
    
        File "/tmp/ipykernel_140399/3207563206.py", line 67, in call  *
            Z = self.multihead(Z,  training=training)
        File "/home/mortimer/.local/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler  **
            raise e.with_traceback(filtered_tb) from None
        File "/home/mortimer/.local/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 96, in error_handler
            return fn(*args, **kwargs)
    
        TypeError: MultiHeadAttention.call() missing 1 required positional argument: 'value'
    
    
    Call arguments received by layer "block_8" "                 f"(type Block):
      • X=tf.Tensor(shape=(1, 1, 256), dtype=float32)
      • training=False


Call arguments received by layer "gpt_1" "                 f"(type GPT):
  • X=tf.Tensor(shape=(1, 1), dtype=int32)
  • training=False

In [None]:
#import os
#os.system('export XLA_FLAGS=–xla_gpu_cuda_data_dir=/opt/cuda')
#model.optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)

In [14]:
history = model.train(dataset, 10)

TRAINING: : 10it [4:33:29, 1640.92s/it]


In [16]:
X, Y = next(iter(dataset))
#print(model(X)[0, 32, :])
model.generate_text('The mode of capital ', output_length=16, top_k=32)

<tf.Tensor: shape=(), dtype=string, numpy=b'The mode of capital ins by a this and and  by and value as a it'>

In [None]:
tokenizer.detokenize(tokenizer.tokenize('I bims der Marx!'))

In [None]:
tokenizer.tokenize('a b c d')

In [None]:
tokenizer.detokenize([ 11, 462, 209, 754])

In [None]:
model.save_weights('GPT_V3')