In [None]:
# Load the pretrained 'gpt2' tokenizer once; later cells (encode_seq,
# decode_seq, the model-building cell) read it through the module-level
# name `tokenizer`.
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [None]:
import pickle
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
from IPython.display import display
import sympy as sp
import urllib
sp.init_printing(use_latex=True)
import matplotlib.pyplot as plt
import textwrap
#import os
#os.environ['LD_LIBRARY_PATH']='/opt/conda/lib'

In [None]:
import os

In [None]:
class TransformerBlock(keras.layers.Layer):
    """Post-layer-norm Transformer block with unmasked self-attention.

    Each sub-layer (self-attention, then a two-layer GELU feed-forward
    network) is followed by dropout, added back to its input and then
    layer-normalized.

    Args:
        embed_dim: Width of the block's input/output features. Also passed to
            `MultiHeadAttention` as `key_dim`.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden feed-forward layer.
        rate: Dropout rate used after attention and after the feed-forward
            network.
        **kwargs: Forwarded to `keras.layers.Layer`.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super(TransformerBlock, self).__init__(**kwargs)
        # Remembered so get_config() can reproduce the constructor call.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate
        self.att = keras.layers.MultiHeadAttention(num_heads=num_heads,
                                                   key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [keras.layers.Dense(ff_dim, activation="gelu"),
             keras.layers.Dense(embed_dim),]
        )
        self.layernorm1 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = keras.layers.Dropout(rate)
        self.dropout2 = keras.layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply attention and feed-forward sub-layers to `inputs`.

        Args:
            inputs: Tensor fed to self-attention as both query and value.
            training: Forwarded to the dropout layers. Defaults to None so the
                layer can also be called without an explicit flag.

        Returns:
            The layer-normalized output of the second residual connection.
        """
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config


In [None]:
class GPTransformerBlock(keras.layers.Layer):
    """Pre-layer-norm Transformer block with causally masked self-attention.

    Layer normalization is applied *before* each sub-layer and the sub-layer
    output (after dropout) is added back to the un-normalized stream. This
    differs from `TransformerBlock`, which normalizes *after* each residual
    addition and uses no causal mask.

    Args:
        embed_dim: Width of the block's input/output features. Also passed to
            `MultiHeadAttention` as `key_dim`.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden feed-forward layer.
        rate: Dropout rate used after attention and after the feed-forward
            network.
        **kwargs: Forwarded to `keras.layers.Layer`.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super(GPTransformerBlock, self).__init__(**kwargs)
        # Remembered so get_config() can reproduce the constructor call.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate
        # NOTE(review): Keras documents key_dim as the size of *each* head, so
        # every head here is embed_dim wide. Changing it would change the
        # weight shapes and break loading of existing checkpoints.
        self.att = keras.layers.MultiHeadAttention(num_heads=num_heads,
                                                   key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [keras.layers.Dense(ff_dim, activation="gelu"),
             keras.layers.Dense(embed_dim),]
        )
        self.layernorm1 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = keras.layers.Dropout(rate)
        self.dropout2 = keras.layers.Dropout(rate)

    def call(self, x, training=None):
        """Run the attention and feed-forward sub-layers on `x`.

        Args:
            x: Input tensor; used as the residual stream.
            training: Forwarded to every sub-layer. Defaults to None so the
                layer can also be called without an explicit flag.

        Returns:
            `x` plus the attention update plus the feed-forward update.
        """
        # Attention sub-layer: normalize, attend causally, drop, add back.
        y = self.layernorm1(x, training=training)
        y = self.att(y, y,
                     use_causal_mask = True,
                     training=training)
        y = self.dropout1(y, training=training)
        # Rebind instead of `x += y` so the caller's object is never mutated.
        x = x + y

        # Feed-forward sub-layer: normalize, transform, drop, add back.
        y = self.layernorm2(x, training=training)
        y = self.ffn(y, training=training)
        y = self.dropout2(y, training=training)
        return x + y

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config


In [None]:
# Source pulled from KerasNLP: https://github.com/keras-team/keras-nlp/blob/v0.4.1/keras_nlp/layers/sine_position_encoding.py#L22
class SinePositionEncoding(keras.layers.Layer):
    """Fixed (non-trainable) sine/cosine positional encoding.

    Given an embedded sequence, the layer produces a tensor of the same shape
    holding the sinusoidal position signal from
    [Attention is All You Need](https://arxiv.org/abs/1706.03762). The result
    is meant to be added to the embedding by the caller; the layer itself
    does not add it.

    The input is read as `[..., sequence_length, feature_size]`: the
    second-to-last axis is the position axis and the last axis is the feature
    axis. Even feature indices receive a sine, odd feature indices a cosine.

    `tf.RaggedTensor` inputs are supported: the encoding is computed for the
    bounding shape and then re-ragged with the input's row lengths.

    Args:
        max_wavelength: Largest wavelength used by the sine/cosine curves.
            Defaults to 10000.

    Example:
    ```python
    embedding = keras.layers.Embedding(input_dim=1000, output_dim=32)(inputs)
    outputs = embedding + SinePositionEncoding()(embedding)
    ```

    References:
     - [Vaswani et al., 2017](https://arxiv.org/abs/1706.03762)
    """

    def __init__(self, max_wavelength=10000, **kwargs):
        super().__init__(**kwargs)
        self.max_wavelength = max_wavelength

    def call(self, inputs):
        """Return the positional encoding matching the shape of `inputs`."""
        # Dense input: the dynamic shape is all that is needed.
        if not isinstance(inputs, tf.RaggedTensor):
            return self._compute_trim_and_broadcast_position_embeddings(
                tf.shape(inputs)
            )

        # Ragged input: build the dense encoding for the bounding box, then
        # cut it back to the same row lengths as the input.
        dense_encoding = self._compute_trim_and_broadcast_position_embeddings(
            inputs.bounding_shape()
        )
        return tf.RaggedTensor.from_tensor(
            dense_encoding, inputs.nested_row_lengths()
        )

    def _compute_trim_and_broadcast_position_embeddings(self, shape):
        """Build the `[seq, feature]` table and broadcast it to `shape`."""
        dtype = self.compute_dtype
        num_positions, num_features = shape[-2], shape[-1]
        feature_index = tf.range(num_features)

        position_index = tf.cast(tf.range(num_positions), dtype)
        # Features are paired (0,1), (2,3), ...; both members of a pair share
        # one rate, obtained by raising 1/max_wavelength to a power in [0, 1).
        base_rate = tf.cast(1 / self.max_wavelength, dtype=dtype)
        pair_exponent = (
            tf.cast(2 * (feature_index // 2), dtype)
            / tf.cast(num_features, dtype)
        )
        rates = tf.pow(base_rate, pair_exponent)

        # Outer product: one phase per (position, feature).
        phases = tf.expand_dims(position_index, 1) * tf.expand_dims(rates, 0)

        # 1.0 on odd feature indices (cosine), 0.0 on even ones (sine).
        odd_selector = tf.cast(feature_index % 2, dtype)
        even_selector = 1 - odd_selector
        encoding_table = (
            tf.sin(phases) * even_selector + tf.cos(phases) * odd_selector
        )

        return tf.broadcast_to(encoding_table, shape)

    def get_config(self):
        """Return the base layer config extended with `max_wavelength`."""
        return {
            **super().get_config(),
            "max_wavelength": self.max_wavelength,
        }

In [None]:
class MaskedTokenAndPositionEmbedding(keras.layers.Layer):
    """Sum of a token embedding and a learned position embedding.

    Positions are numbered from 1. Wherever the token id is 0 the position
    index is forced to 0 as well, so all such slots share position row 0.

    Args:
        maxlen: Largest sequence length supported; the position table has
            `maxlen + 1` rows (row 0 is reserved for id-0 slots).
        input_dim: Size of the token vocabulary.
        output_dim: Width of both embeddings.
        **kwargs: Forwarded to `keras.layers.Layer`.
    """

    def __init__(self, maxlen, input_dim, output_dim, **kwargs):
        super(MaskedTokenAndPositionEmbedding, self).__init__(**kwargs)
        # Remembered so get_config() can reproduce the constructor call.
        self.maxlen = maxlen
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.token_emb = keras.layers.Embedding(input_dim=input_dim,
                                                output_dim=output_dim,
                                                mask_zero=True)
        self.pos_emb = keras.layers.Embedding(input_dim=maxlen+1,
                                              output_dim=output_dim,
                                              mask_zero=True)

    def call(self, x):
        """Embed token ids `x` and add their position embeddings.

        Args:
            x: Token ids; the last axis is the sequence axis.

        Returns:
            Token embedding plus position embedding.
        """
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=1, limit=seq_len+1, delta=1)
        # sign(x) is 0 for id 0, so those slots collapse to position 0.
        positions = positions * tf.cast(tf.sign(x),tf.int32)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update(
            {
                "maxlen": self.maxlen,
                "input_dim": self.input_dim,
                "output_dim": self.output_dim,
            }
        )
        return config

In [None]:
class MaskedTokenAndSinePositionEmbedding(keras.layers.Layer):
    """Sum of a token embedding and a sinusoidal position encoding.

    The position encoding is multiplied by `sign(token_id)`, so slots whose
    token id is 0 receive the token embedding only.

    Args:
        input_dim: Size of the token vocabulary.
        output_dim: Width of the token embedding (and of the encoding).
        max_wavelength: Forwarded to `SinePositionEncoding`.
        **kwargs: Forwarded to `keras.layers.Layer`.
    """

    def __init__(self, input_dim, output_dim, max_wavelength=10000,**kwargs):
        super(MaskedTokenAndSinePositionEmbedding, self).__init__(**kwargs)
        # Remembered so get_config() can reproduce the constructor call.
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.max_wavelength = max_wavelength
        self.token_emb = keras.layers.Embedding(input_dim=input_dim,
                                                output_dim=output_dim,
                                                mask_zero=True)
        self.pos_emb = SinePositionEncoding(max_wavelength=max_wavelength)

    def call(self, x):
        """Embed token ids `x` and add the masked sinusoidal encoding.

        Args:
            x: Token ids; float or integer dtype.

        Returns:
            Token embedding plus the position encoding (zeroed where id is 0).
        """
        token_vectors = self.token_emb(x)
        positions = self.pos_emb(token_vectors)
        # Cast the mask to the encoding's dtype: without this, integer ids
        # make the multiplication fail on a dtype mismatch.
        mask = tf.cast(tf.expand_dims(tf.sign(x),-1), positions.dtype)
        return token_vectors + positions * mask

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update(
            {
                "input_dim": self.input_dim,
                "output_dim": self.output_dim,
                "max_wavelength": self.max_wavelength,
            }
        )
        return config

In [None]:
# Custom masked loss/accuracy functions
# Per-token cross-entropy on logits; reduction is left to the masked
# function below so that ignored targets can be excluded from the mean.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def MaskedSparseCategoricalCrossentropy(real, pred):
    """Mean cross-entropy over the targets whose id is not 0.

    Args:
        real: Target token ids; entries equal to 0 are ignored.
        pred: Logits with one trailing vocabulary axis.

    Returns:
        Scalar: summed loss of the counted targets divided by their number,
        or 0 when no target is counted (instead of NaN from 0/0).

    NOTE(review): other cells of this notebook pad with "<|endoftext|>" /
    id 50256 rather than 0 — confirm that id 0 is the intended ignore value.
    """
    per_token = loss_object(real, pred)
    keep = tf.cast(tf.math.not_equal(real, 0), dtype=per_token.dtype)
    return tf.math.divide_no_nan(tf.reduce_sum(per_token * keep),
                                 tf.reduce_sum(keep))

def MaskedSparseCategoricalAccuracy(real, pred):
    """Fraction of non-zero targets whose arg-max prediction is correct.

    Args:
        real: Target token ids; entries equal to 0 are ignored.
        pred: Scores with the vocabulary on the last axis.

    Returns:
        Scalar float32: correct counted targets divided by counted targets,
        or 0 when no target is counted (instead of NaN from 0/0).
    """
    counted = tf.math.not_equal(real, 0)
    # Reduce over the last axis rather than a hard-coded axis 2 so the
    # metric does not depend on the rank of `pred`.
    correct = tf.equal(tf.cast(real,tf.int64), tf.argmax(pred, axis=-1))
    correct = tf.cast(tf.math.logical_and(counted, correct), dtype=tf.float32)
    counted = tf.cast(counted, dtype=tf.float32)
    return tf.math.divide_no_nan(tf.reduce_sum(correct),
                                 tf.reduce_sum(counted))


In [None]:
def remove_filler(val):
    """Predicate for `filter`: keep every token except "<|endoftext|>"."""
    return val != "<|endoftext|>"
def encode_seq(x,max_length=0):
    """Tokenize `x`, right-padded with "<|endoftext|>" filler tokens.

    The text is tokenized once to count its tokens, the filler string is
    appended the computed number of times, and the padded text is tokenized
    again.

    Args:
        x: Text to encode.
        max_length: Padding target. The number of fillers appended is
            `max_length - token_count - 20`; a non-positive count appends
            nothing, so the default of 0 never pads.

    Returns:
        The `input_ids` list produced by the tokenizer for the padded text.
    """
    filler = "<|endoftext|>"
    token_count = len(tokenizer(x)['input_ids'])
    # NOTE(review): the 20 is hard-coded; it equals the notebook's
    # segment_size but is not linked to it — confirm the intent.
    filler_count = max_length - token_count - 20
    padded_text = x + ''.join([filler] * filler_count)
    return tokenizer(padded_text)['input_ids']

def decode_seq(x):
    """Turn token ids back into text, dropping "<|endoftext|>" fillers.

    Args:
        x: Sequence of token ids.

    Returns:
        The string rebuilt from the remaining tokens.
    """
    kept_tokens = [
        token
        for token in tokenizer.convert_ids_to_tokens(x)
        if remove_filler(token)
    ]
    return tokenizer.convert_tokens_to_string(kept_tokens)

In [None]:
# Length and batching constants shared by the later cells.
# segment_size: base unit; model_length and memory_size are multiples of it.
segment_size = 20
# model_length: used to size the text chunks and passed to DataGenerator.
model_length = 10*segment_size
# batch_size: rows per batch produced by DataGenerator.
batch_size = 100

In [None]:
# --- Model hyper-parameters -------------------------------------------------
# Vocabulary size, taken from the tokenizer; used for both the embedding
# table and the width of the output layer.
n_tokens = len(tokenizer.get_vocab())
embedding_size = 128
# Number of GPTransformerBlock layers stacked below.
stack = 5
num_heads = 12
# NOTE(review): memory_size is not referenced by any other visible cell.
memory_size = segment_size * 3

# --- Architecture -----------------------------------------------------------
# `x` stays bound to the symbolic input while `y` is threaded through the
# layers. The sequence axis is None, so any sequence length is accepted.
y = x = keras.layers.Input((None,))
y = MaskedTokenAndSinePositionEmbedding(input_dim=n_tokens,
                                        output_dim=embedding_size)(y)
# The layers are created in this exact order; load_weights() in a later cell
# restores a checkpoint into them, so do not reorder or resize them.
for _ in range(stack):
    y = GPTransformerBlock(embedding_size,
                           num_heads,
                           embedding_size*2)(y)

# One unnormalized score (logit) per vocabulary entry; the loss is configured
# with from_logits=True, so no softmax is applied here.
y = keras.layers.Dense(n_tokens)(y)

model = keras.Model(x,y)
model.compile(loss=MaskedSparseCategoricalCrossentropy,
              optimizer=keras.optimizers.Adam(),
              metrics=MaskedSparseCategoricalAccuracy)
model.summary()
# NOTE(review): plot_model presumably needs pydot/graphviz installed — confirm.
keras.utils.plot_model(model,show_shapes=True,expand_nested=True)

In [None]:
#make file to upload to digitalocean
#tar -czvf training_1.tar.gz training_1

In [None]:
#download and extract file from digitalocean
#curl -O https://s-stem-data.nyc3.digitaloceanspaces.com/P_D_Weights/training_1.tar.gz
#tar -xzvf training_1.tar.gz

In [None]:
# Download the Austen corpus into the working directory.
# `-f` makes curl exit non-zero on an HTTP error instead of saving the
# server's error page as Austen.txt; the status is checked so a failed
# download stops the notebook here rather than in a later cell.
austen_status = os.system('curl -fO https://s-stem-data.nyc3.digitaloceanspaces.com/Austen.txt')
if austen_status != 0:
    raise RuntimeError(f'Downloading Austen.txt failed (status {austen_status})')

In [None]:
# Download the trained-weights archive and unpack it into ./training_1.
# Each command's exit status is checked (and curl runs with `-f`) so that a
# failed download or extraction stops here instead of surfacing later as a
# confusing load_weights() error.
for weights_command in (
    'curl -fO https://s-stem-data.nyc3.digitaloceanspaces.com/P_D_Weights/training_1.tar.gz',
    'tar -xzvf training_1.tar.gz',
):
    weights_status = os.system(weights_command)
    if weights_status != 0:
        raise RuntimeError(
            f'Command failed (status {weights_status}): {weights_command}')

In [None]:
# Checkpoint prefix inside the extracted training_1/ directory; passed to
# model.load_weights() in the next cell.
checkpoint_path = "training_1/cp.ckpt"
# NOTE(review): checkpoint_dir is not referenced by any other visible cell.
checkpoint_dir = os.path.dirname(checkpoint_path)

In [None]:
# Restore the trained weights into `model`. Requires the training_1 archive
# to have been downloaded and extracted, and the model to have been built
# with the same layer structure as when the checkpoint was written.
model.load_weights(checkpoint_path)

In [None]:
class DataGenerator(keras.utils.Sequence):
    """Keras `Sequence` serving shuffled (input, target) batches.

    Args:
        data: Pair `(inputs, targets)` of arrays indexed along axis 0; only
            a reference is kept, the arrays are not copied.
        max_length: Stored on the instance; not used by the class itself.
        batch_size: Number of rows per batch.
        **kwargs: Forwarded to `keras.utils.Sequence`.
    """

    def __init__(self, data, max_length, batch_size=32, **kwargs):
        super(DataGenerator, self).__init__(**kwargs)
        self.batch_size = batch_size
        self.data = data
        # Row order; reshuffled in place at the end of every epoch.
        self.indices = np.arange(self.data[0].shape[0])
        self.max_length = max_length
        self.idx = 0
        # Shuffle once up front so the first epoch is randomized too.
        self.on_epoch_end()

    def __len__(self):
        """Number of full batches per epoch (a trailing partial batch is dropped)."""
        row_count = self.data[0].shape[0]
        return int(row_count // self.batch_size)

    def __getitem__(self, index):
        """Return batch number `index` as an `(x, y)` pair of tensors."""
        first = index * self.batch_size
        batch_rows = self.indices[first:first + self.batch_size]
        return self.__data_generation(batch_rows)

    def __data_generation(self, ids):
        """Gather rows `ids` from both arrays and convert them to int32 tensors."""
        inputs, targets = self.data[0], self.data[1]
        return (tf.convert_to_tensor(inputs[ids],dtype=tf.int32),
                tf.convert_to_tensor(targets[ids],dtype=tf.int32))

    def on_epoch_end(self):
        """Shuffle the row order in place; returns None."""
        np.random.shuffle(self.indices)

    ## Needed for TF Dataset conversion...
    def output_signature(self):
        """Return the `TensorSpec` pair describing the first batch."""
        first_x, first_y = self[0]
        return (tf.TensorSpec.from_tensor(first_x),
                tf.TensorSpec.from_tensor(first_y))

In [None]:
# Read the corpus and split it into lines.
with open('Austen.txt', 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')

# Group consecutive non-blank lines into paragraphs (joined with spaces) and
# cut every paragraph into pieces of at most `model_length-2` characters.
# NOTE(review): the -2 presumably mirrors the "+2 # Start+stop" allowance
# used when max_length is computed later — confirm.
chunk_chars = model_length-2
text = []
paragraph_lines = []
# The extra '' acts as a final paragraph terminator, so the last paragraph is
# kept even when the file does not end with a blank line.
for raw_line in lines + ['']:
    if raw_line != '':
        paragraph_lines.append(raw_line)
        continue
    line = ' '.join(paragraph_lines)
    paragraph_lines = []
    if line != '':
        # extend() keeps this linear; `text = text + [...]` re-copied the
        # whole list for every paragraph.
        text.extend(line[k:k+chunk_chars]
                    for k in range(0, len(line), chunk_chars))

In [None]:
# Subset? - All of the data will take some time...
# Use at most 1000 chunks. Clamping to len(text) keeps split_point consistent
# with the data actually available when the corpus yields fewer chunks.
n_seq = min(1000, len(text))
# First 80% of the (shuffled) chunks train, the rest validate.
split_point = int(n_seq * 0.8)
text = text[:n_seq]
# NOTE: no seed is set, so the train/validation membership differs per run.
np.random.shuffle(text) # In-place modification
# Longest chunk in characters, plus 2.
max_length = np.max([len(i) for i in text])+2 # Start+stop
max_length

In [None]:
# Padding target handed to encode_seq: max_length plus the distance from
# (max_length-1) to the next multiple of segment_size.
padded_length = max_length + (segment_size - (max_length-1) % segment_size)

# One row of token ids per chunk; np.vstack needs every row to have the same
# length, which relies on encode_seq padding them equally.
X = np.vstack([encode_seq(passage, padded_length) for passage in text])

# Next-token prediction pairs: inputs drop the last column, targets drop the
# first, so target[t] is the token following input[t].
training = DataGenerator(
    (X[:split_point,:-1], X[:split_point,1:]),
    model_length,
    batch_size,
)
validation = DataGenerator(
    (X[split_point:,:-1], X[split_point:,1:]),
    model_length,
    batch_size,
)

In [None]:
# model.evaluate returns [loss, metric]; index 1 is the masked accuracy
# registered in compile(). Reported as a percentage.
training_metrics = model.evaluate(training)
print('Training Accuracy:',training_metrics[1]*100.0,'%')

validation_metrics = model.evaluate(validation)
print('Validation Accuracy:',validation_metrics[1]*100.0,'%')

In [None]:
def select_token(x):
    """Sample an index from the 1-D weight vector `x`.

    Index `i` is chosen with probability proportional to `x[i]`, using
    inverse-CDF sampling with one draw from `np.random.random()`.

    The draw is scaled by the total weight, so it always falls below the end
    of the cumulative sum. Without the scaling, a vector summing to slightly
    less than 1 (typical for float32 softmax output) could lose the
    comparison at every index, and the function then silently returned 0.

    Args:
        x: 1-D array of non-negative weights, normally probabilities.

    Returns:
        The sampled index as a NumPy integer.

    Raises:
        ValueError: If `x` is empty.
    """
    cdf = np.cumsum(x)
    if cdf.size == 0:
        raise ValueError("select_token() needs at least one probability")
    draw = np.random.random() * cdf[-1]
    # First index whose cumulative weight exceeds the draw.
    index = np.searchsorted(cdf, draw, side='right')
    # Only reachable for degenerate input (e.g. all-zero weights).
    return np.minimum(index, cdf.size - 1)

In [None]:
# Turn off teacher forcing...
# Free-running generation: the model is fed its own previous outputs instead
# of the ground-truth sequence.
# Prompt (needs to be at least 1 - the start token?...)
# Row `i` of the first training batch is the reference sequence.
i = 5
data = training[0][0][i:i+1]
# Number of leading tokens copied from the reference as the prompt.
prompt = 3
# Working buffer, same shape as `data`, pre-filled with 50256 — the id this
# cell treats as the stop token (see the `break` below).
tokens = np.full(data.shape,dtype=np.int32,fill_value=50256)
tokens[0,0:prompt] = data[:,0:prompt]

print("Original:")
print(decode_seq(data[0]))
print()

print("Prompt:")
print(decode_seq(tokens[0]))
print()

print("Decoding:")
print(decode_seq(tokens[0]),end='')
# NOTE: the loop variable rebinds the module-level `x` (the keras Input
# created in the model-building cell).
for x in range(prompt,data.shape[1]-1):
    # The whole buffer is re-run through the model at every step; the scores
    # at position x-1 give the distribution for the token at position x.
    probabilities = keras.activations.softmax(model(tokens)).numpy()[0,x-1]
    # Most likely token...
    # (greedy choice; overwritten by the sampled choice two lines below)
    result = probabilities.argmax(-1)
    # Sampled token...
    result = np.apply_along_axis(select_token, -1, probabilities)
    tokens[0,x] = result
    if result == 50256:
        break # Stop token found!
    print(decode_seq(tokens[0,x:x+1]),end='')
print()
print()

# One more forward pass over the generated buffer; decode the arg-max
# prediction at every position.
result = model(tokens).numpy()
print("Remodeled:")
print(decode_seq(result.argmax(-1)[0]))