### SimCLR process used on Text

Process:

- New data generator
- Causal Mask
- Troubleshoot when it breaks

In [21]:
# Setup environment

import os
os.environ["LD_LIBRARY_PATH"]='/opt/conda/lib'
os.environ["XLA_FLAGS"] = "--xla_gpu_cuda_data_dir=/opt/conda/pkgs/cuda-toolkit"

In [22]:
import matplotlib.pyplot as plt
import numpy as np
import numpy.typing as npt
import tensorflow as tf
import tensorflow.keras as keras

In [23]:
# Download Austen Text
# !wget https://www.cs.mtsu.edu/~jphillips/courses/CSCI4850-5850/public/Austen.txt

### Setup Austen Text

In [24]:
segment_size = 20
model_length = 10*segment_size
batch_size = 100

In [25]:
with open('Austen.txt', 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')
text = []
j = 0
for i in range(len(lines)-1):
    if lines[i] == '':
        line = ' '.join(lines[j:i])
        if line != '':
            text = text + [line[k:k+model_length-2] for k in range(0, len(line), model_length-2)]
            # text = text + [line]
        j = i+1

In [26]:
len(text)

27423

In [27]:
text[0]

'Emma Woodhouse, handsome, clever, and rich, with a comfortable home and happy disposition, seemed to unite some of the best blessings of existence; and had lived nearly twenty-one years in the world'

### Data Generator
- Create batches of data based on encoded text
- Indexing will pull randomly cropped sections text

In [28]:
#Import GPT2Tokenizer to use

from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [78]:
#Encode / Decode tokens using built-in functions

def encode_seq(x, max_length=0):
    #String to token IDs
    #Using end of text token as padding
    encoded = tokenizer.encode(x, max_length=max_length, truncation=True)
    padding = encoded + [50256 for i in range (max_length - len(encoded))]
    return padding

def decode_seq(x):
    #Token IDs to string
    remove_padding = []
    for i in range(len(x)):
        if x[i] == 50256: #end of text
            break
        remove_padding.append(x[i])
    return tokenizer.decode(remove_padding)

In [30]:
test = text[0]
print(test)

Emma Woodhouse, handsome, clever, and rich, with a comfortable home and happy disposition, seemed to unite some of the best blessings of existence; and had lived nearly twenty-one years in the world


In [31]:
def random_crop(text):
    crop_percent = np.random.uniform(0.4, 1)

    if crop_percent == 1:
        return text
        
    length = len(text)
    index = int(np.floor(length*crop_percent))

    cropped_text = text[index:]
    return cropped_text

cropped_test = random_crop(test)
print(cropped_test)

in the world


In [32]:
class DataGenerator(keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(self, data, max_length, batch_size=32, **kwargs):
        super(DataGenerator, self).__init__(**kwargs)
        'Initialization'
        self.batch_size = batch_size
        self.data = data # a handle only...
        self.indices = np.arange(self.data[0].shape[0])
        self.max_length = max_length
        self.idx = 0
        self.on_epoch_end()

    def __len__(self):
        'The number of batches per epoch'
        return int(np.floor(self.data[0].shape[0] / self.batch_size))

    def __getitem__(self, index):
        'Generate one (enumerated) batch of data'
        # Generate indices for a batch and grab batch
        indices = self.indices[index*self.batch_size:(index+1)*self.batch_size]
        return self.__data_generation(indices)

    def __data_generation(self, ids):
        'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
        # Generate data
        x = tf.convert_to_tensor(self.data[0][ids],dtype=tf.int32)
        y = tf.convert_to_tensor(self.data[1][ids],dtype=tf.int32)
        return x, y

    def on_epoch_end(self):
        return np.random.shuffle(self.indices)

    ## Needed for TF Dataset conversion...
    def output_signature(self):
        sig = self[0]
        return (tf.TensorSpec.from_tensor(sig[0]),
                tf.TensorSpec.from_tensor(sig[1]))

In [33]:
# Subset? - All of the data will take some time...
n_seq = len(text)
n_seq = 1000
split_point = int(n_seq * 0.8)
text = text[:n_seq]
np.random.shuffle(text) # In-place modification
max_length = np.max([len(i) for i in text])+2 # Start+stop
max_length

200

In [94]:
# Encode in batches
X = np.vstack([encode_seq(x,max_length + (segment_size - (max_length-1) % segment_size)) for x in text])
training = DataGenerator((X[:split_point,:-1],
                          X[:split_point,1:]),model_length,batch_size)
validation = DataGenerator((X[split_point:,:-1],
                            X[split_point:,1:]),model_length,batch_size)

In [95]:
decode_seq(training[0][0][0])

'he table, took it up, and examined it very attentively. With the view of passing off an awkward moment, Emma smilingly said,'

### Transformer Setup
- Causal Mask
- Feed Forward
- Relative Encodings

In [104]:
# Relative Mutli Head Attention from SirDavidLudwig's DeepDNA
# https://github.com/DLii-Research/deep-dna/tree/master

class RelativeMultiHeadAttention(keras.layers.MultiHeadAttention):
    def __init__(self, max_seq_len=None, **kwargs):
        super().__init__(**kwargs)
        self._max_seq_len = max_seq_len

    def build(self, input_shape):
        if self._max_seq_len is None:
            self._max_seq_len = input_shape[1]
            assert self._max_seq_len is not None, "RelativeMultiHeadAttention requires max_seq_len to be specified."
        self._rel_embeds = self.add_weight("relative_embeddings",
                                           shape=(self._max_seq_len, self._key_dim),
                                           initializer="glorot_uniform", trainable=True)
        return super().build(input_shape)

    def get_config(self):
        config = super().get_config()
        config.update({
            "max_seq_len": self._max_seq_len
        })
        return config

    def _skew(self, QEr):
        padded = tf.pad(QEr, [[0, 0], [0, 0], [0, 0], [1, 0]])
        shape = tf.shape(padded)
        reshaped = tf.reshape(padded, (shape[0], shape[1], shape[3], shape[2]))
        return reshaped[:,:,1:,:]

    def _compute_attention(self, query, key, value, attention_mask=None, training=None):
        # Note: Applying scalar multiply at the smaller end of einsum improves
        # XLA performance, but may introduce slight numeric differences in
        # the Transformer attention head.
        query = tf.multiply(query, 1.0 / np.sqrt(float(self._key_dim)))

        # Compute relative position encodings
        rel_enc = self._skew(tf.einsum("acbd,ed->abce", query, self._rel_embeds))

        # Take the dot product between "query" and "key" to get the raw
        # attention scores.
        attention_scores = tf.einsum(self._dot_product_equation, key, query)

        attention_scores = self._masked_softmax(attention_scores + rel_enc, attention_mask)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_scores_dropout = self._dropout_layer(attention_scores, training=training)

        # `context_layer` = [B, T, N, H]
        attention_output = tf.einsum(self._combine_equation, attention_scores_dropout, value)
        return attention_output, attention_scores


In [105]:
class TransformerBlock(keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, max_seq_len=None):
        super(TransformerBlock, self).__init__()
        self.att = RelativeMultiHeadAttention(max_seq_len=max_seq_len, num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [keras.layers.Dense(ff_dim, activation="gelu"),
             keras.layers.Dense(embed_dim),]
        )
        self.layernorm1 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = keras.layers.Dropout(rate)
        self.dropout2 = keras.layers.Dropout(rate)

    def compute_output_shape(self, input_shape):
        return input_shape
        
    def call(self, x, training):
        y = x
        y = self.layernorm1(y, training=training)
        y, scores = self.att(y, y, return_attention_scores=True, training=training, use_causal_mask=True)
        y = self.dropout1(y, training=training)
        x += y
        y = x
        y = self.layernorm2(y, training=training)
        y = self.ffn(y, training=training)
        y = self.dropout2(y, training=training)
        return (x + y, scores)

In [55]:
# Positional encoding for self-attention

# Source pulled from KerasNLP: https://github.com/keras-team/keras-nlp/blob/v0.4.1/keras_nlp/layers/sine_position_encoding.py#L22
class SinePositionEncoding(keras.layers.Layer):
    """Sinusoidal positional encoding layer.
    This layer calculates the position encoding as a mix of sine and cosine
    functions with geometrically increasing wavelengths. Defined and formulized
    in [Attention is All You Need](https://arxiv.org/abs/1706.03762).
    Takes as input an embedded token tensor. The input must have shape
    [batch_size, sequence_length, feature_size]. This layer will return a
    positional encoding the same size as the embedded token tensor, which
    can be added directly to the embedded token tensor.
    This layer optionally accepts `tf.RaggedTensor`s as inputs to process
    batches of sequences of different lengths. The one ragged dimension must be
    the dimension that corresponds to the sequence, that is, the penultimate
    dimension.
    Args:
        max_wavelength: The maximum angular wavelength of the sine/cosine
            curves, as described in Attention is All You Need. Defaults to
            10000.
    Examples:
    ```python
    # create a simple embedding layer with sinusoidal positional encoding
    seq_len = 100
    vocab_size = 1000
    embedding_dim = 32
    inputs = keras.Input((seq_len,), dtype=tf.float32)
    embedding = keras.layers.Embedding(
        input_dim=vocab_size, output_dim=embedding_dim
    )(inputs)
    positional_encoding = keras_nlp.layers.SinePositionEncoding()(embedding)
    outputs = embedding + positional_encoding
    ```
    References:
     - [Vaswani et al., 2017](https://arxiv.org/abs/1706.03762)
    """

    def __init__(
        self,
        max_wavelength=10000,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.max_wavelength = max_wavelength

    def call(self, inputs):
        # TODO(jbischof): replace `hidden_size` with`hidden_dim` for consistency
        # with other layers.
        if isinstance(inputs, tf.RaggedTensor):
            bounding_shape = inputs.bounding_shape()
            position_embeddings = (
                self._compute_trim_and_broadcast_position_embeddings(
                    bounding_shape,
                )
            )
            # then apply row lengths to recreate the same ragged shape as inputs
            return tf.RaggedTensor.from_tensor(
                position_embeddings,
                inputs.nested_row_lengths(),
            )
        else:
            return self._compute_trim_and_broadcast_position_embeddings(
                tf.shape(inputs),
            )

    def _compute_trim_and_broadcast_position_embeddings(self, shape):
        seq_length = shape[-2]
        hidden_size = shape[-1]
        position = tf.cast(tf.range(seq_length), self.compute_dtype)
        min_freq = tf.cast(1 / self.max_wavelength, dtype=self.compute_dtype)
        timescales = tf.pow(
            min_freq,
            tf.cast(2 * (tf.range(hidden_size) // 2), self.compute_dtype)
            / tf.cast(hidden_size, self.compute_dtype),
        )
        angles = tf.expand_dims(position, 1) * tf.expand_dims(timescales, 0)
        # even indices are sine, odd are cosine
        cos_mask = tf.cast(tf.range(hidden_size) % 2, self.compute_dtype)
        sin_mask = 1 - cos_mask
        # embedding shape is [seq_length, hidden_size]
        positional_encodings = (
            tf.sin(angles) * sin_mask + tf.cos(angles) * cos_mask
        )

        return tf.broadcast_to(positional_encodings, shape)

    def get_config(self):
        config = super().get_config()
        config.update(
            {
                "max_wavelength": self.max_wavelength,
            }
        )
        return config

In [53]:
# Sine position embeddings, for self-attention
class MaskedTokenAndSinePositionEmbedding(keras.layers.Layer):
    def __init__(self, input_dim, output_dim, max_wavelength=10000,**kwargs):
        super(MaskedTokenAndSinePositionEmbedding, self).__init__(**kwargs)
        self.token_emb = keras.layers.Embedding(input_dim=input_dim,
                                                output_dim=output_dim,
                                                mask_zero=True)
        self.pos_emb = SinePositionEncoding(max_wavelength=max_wavelength)

    def call(self, x):
        mask = tf.expand_dims(tf.sign(x),-1)
        x = self.token_emb(x)
        positions = self.pos_emb(x)
        positions = positions * mask
        return x + positions

In [106]:
embed_dim = 64
kernel = 4
num_heads = 6
ff_dim = 512
stack = 2

# Note the input size (there is only one channel - intensity)
# these images... if you are using color images, your would
# need to set the last dimension of the input_shape to -3-
# above and this would carry over into this cell...
y = x = keras.layers.Input(shape=x_train.shape[1:])
# This layer will just be passed a constant integer for
# embedding (class token - see the ViT paper)
# input_class = keras.layers.Input(shape=(1,))

# Patches
y = keras.layers.Conv2D(embed_dim,
                        kernel_size=(kernel,kernel),
                        strides=(kernel,kernel))(tf.expand_dims(y,-1))
temp = y

# Hybrid CNN
# y = keras.layers.Conv2D(embed_dim*2,
#                         kernel_size=(5, 5),
#                         strides=2,
#                         activation='gelu')(y)
# y = keras.layers.Conv2D(embed_dim,
#                         kernel_size=(5, 5),
#                         strides=2,
#                         activation='gelu')(y)

# Flatten 2D arrangement to 1D arrangement of tokens
y = keras.layers.Reshape((-1,embed_dim))(y)
y = PositionEmbedding(y.shape[-2],embed_dim)(y)

# Create class token
c = keras.layers.Lambda(lambda x: tf.tile(tf.constant([[0]]),(tf.shape(x)[0],1)))(y)
c = keras.layers.Embedding(input_dim=1,output_dim=embed_dim)(c)

# Prepend class token
y = keras.layers.Concatenate(axis=1)([c,y])

for _ in range(stack):
    y = TransformerBlock(embed_dim, num_heads, ff_dim)(y)[0]
# Lambda layer is like the ViT the paper...
# y = keras.layers.Lambda(lambda x: x[:,0,:])(y)
# Use the following -instead- of Lambda is common in other
# models I have seen... probably not important though
y = keras.layers.GlobalAveragePooling1D()(y)
# y = keras.layers.Dropout(0.1)(y)
# y = keras.layers.Dense(embed_dim,activation='gelu')(y)
#    y = keras.layers.Dropout(0.5)(y)
# y = keras.layers.Dense(10)(y)

model = keras.Model(x,y)
# model.compile(optimizer=keras.optimizers.Nadam(learning_rate=0.0001),
#               loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
#               metrics=keras.metrics.SparseCategoricalAccuracy())
# model.summary()
masked_encoder = model
# model = keras.Model(x,temp)
# keras.utils.plot_model(model,show_shapes=True,expand_nested=True)

NameError: name 'x_train' is not defined

In [None]:
# Finalize and compile model
n_tokens = len(tokenizer)
embedding_size = 128
stack = 5
num_heads = 12
memory_size = segment_size * 3

y = x = keras.layers.Input((None,))
y = MaskedTokenAndSinePositionEmbedding(input_dim=n_tokens,
                                        output_dim=embedding_size)(y)
for _ in range(stack):
    y = GPTransformerBlock(embedding_size,
                           num_heads,
                           embedding_size*2,
                           max_seq_len = max_length)(y)

y = keras.layers.Dense(n_tokens)(y)

model = keras.Model(x,y)

### Contrastive Model
- Uses the Transformer as encoder

In [15]:
accuracy = keras.metrics.SparseCategoricalAccuracy()
def ContrastiveAccuracy(y_true, y_pred):
    # return accuracy(y_true[tf.argmax(y_pred,-1)],
    #                 y_pred)
    acc_masked = accuracy(y_true, y_pred)
    acc_unmasked = accuracy(y_true, tf.transpose(y_pred))
    acc = (acc_masked + acc_unmasked) / 2.0
    return acc

In [16]:
class ContrastiveModel(tf.keras.Model):
    def __init__(
        self,
        masked_encoder: tf.keras.Model,
        unmasked_encoder: tf.keras.Model,
        embed_dim: int = 512,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.masked_encoder = masked_encoder
        self.unmasked_encoder = unmasked_encoder

#         self.masked_encoder.trainable = False
#         self.unmasked_encoder.trainable = False

        self.embed_dim = embed_dim
        self.W_masked = tf.keras.layers.Dense(
            self.embed_dim,
            # input_shape=self.masked_encoder.output_shape[-1],
            use_bias=False)
        self.W_unmasked = tf.keras.layers.Dense(
            self.embed_dim,
            # input_shape=self.unmasked_encoder.output_shape[-1],
            use_bias=False)
        self.t = self.add_weight(
            name="Temperature",
            shape=None,
            trainable=True
        )

    def compile(self, *args, **kwargs):
        return super().compile(
            *args,
            loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
            **kwargs)

    def test_step(self, data):
        n = tf.shape(data[0])[0]
        y_true = tf.range(n)
        y_pred = self(data, training=False)
        loss_masked = self.compiled_loss(y_true, y_pred)
        loss_unmasked = self.compiled_loss(y_true, tf.transpose(y_pred))
        loss = (loss_masked + loss_unmasked) / 2.0
        self.compiled_metrics.update_state(y_true, y_pred)
        return {m.name: m.result() for m in self.metrics}
    
    def train_step(self, data):
        n = tf.shape(data[0])[0]
        y_true = tf.range(n)
        with tf.GradientTape() as tape:
            y_pred = self(data, training=True)
            loss_masked = self.compiled_loss(y_true, y_pred)
            loss_unmasked = self.compiled_loss(y_true, tf.transpose(y_pred))
            loss = (loss_masked + loss_unmasked) / 2.0
        grads = tape.gradient(loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
        self.compiled_metrics.update_state(y_true, y_pred)
        return {m.name: m.result() for m in self.metrics}

    def call(self, inputs, training=False):
        # Get the images from input
        masked_images, unmasked_images = inputs[0], inputs[1]

        # Embed them using the encoders
        masked_features = self.masked_encoder(masked_images)
        unmasked_features = self.unmasked_encoder(unmasked_images)

        # Joint multimodal embedding
        masked_embeddings = self.W_masked(masked_features)
        unmasked_embeddings = self.W_unmasked(unmasked_features)

        # Normalize
        masked_embeddings = masked_embeddings / tf.norm(masked_embeddings)
        unmasked_embeddings = unmasked_embeddings / tf.norm(unmasked_embeddings)

        logits = tf.tensordot(masked_embeddings, tf.transpose(unmasked_embeddings), axes=1) * tf.exp(self.t)

        return logits

In [17]:
cm = ContrastiveModel(masked_encoder, unmasked_encoder)
cm.compile(
    optimizer=tf.keras.optimizers.Adam(),
    metrics=[
        ContrastiveAccuracy
    ],
    run_eagerly=False # Set to true to debug
)

NameError: name 'masked_encoder' is not defined