# Importing Libraries
**For more examples of what Kosh can do visit [GitHub Examples](https://github.com/LLNL/kosh/tree/stable/examples).**

In [None]:
from numbers import Number
from collections import defaultdict

import matplotlib.pyplot as plt

import kosh
import math
import statistics
import numpy as np
import os
import sys

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import tensorflow as tf
# import horovod.tensorflow as hvd

# Pick the run directory from the first command-line argument.
# NOTE(review): sys.argv[1] is read unconditionally, so starting the script
# with no argument raises IndexError — confirm that is intended.
print(sys.argv[1])
if "-f" in sys.argv[1]:  # Running as notebook
    # Notebook runs use a hard-coded output directory and switch the GPU flag off.
    out_path = 'ball-bounce-transformer_20250114-142359'
    use_gpu = False
    %matplotlib inline
else:
    out_path = sys.argv[1]  # Running as script
    use_gpu = True

# Ensembles Initialization
# The Kosh store is the SQLite file inside the run directory.
database = os.path.join(out_path, 'ensembles_output.sqlite')
print(database)
datastore = kosh.connect(database)
print("Kosh is ready!")

# Printing Attributes and Features
# The record at index 1 of an unfiltered search serves as the sample.
# NOTE(review): why index 1 rather than 0 is not visible here — confirm.
test_rec = list(datastore.find())[1]
print('Attributes:')
print('\t',test_rec.list_attributes())
print('\n')
print('Features Sets:')
print('\t',test_rec.list_features())
# Time axis of the sample record; reused below as the x-axis of the split plot.
time=test_rec['physics_cycle_series/time'][:]
# Directory that receives the figures saved further down.
image_path = os.path.join(out_path, 'transformer-ball-bounce/images')
os.makedirs(image_path, exist_ok=True)

# Loading Data

Create a Rank 3 Tensor for each of 60% Training, 20% Validation, and 20% Test Data.

(# of Datasets, # of Time Steps per Dataset, # of Features per Time Step)

In [None]:
# Build one rank-3 tensor per split:
# (# of datasets, # of time steps per dataset, # of features per time step).

# Count only real datasets; the 'mean' record from the other notebook is not one.
num_datasets = sum(
    1 for record in datastore.find(load_type='dictionary') if record['id'] != 'mean'
)
train_datasets = round(num_datasets*.6)
val_datasets = round(num_datasets*.2)
test_datasets = round(num_datasets*.2)
print(f'Total Datasets: {num_datasets}')
print('Number of train, val, and test datasets:', train_datasets, val_datasets, test_datasets)

# Per-split lists of (time steps, features) matrices; stacked once after the
# loop instead of re-allocating the whole tensor with np.vstack on every record.
train_matrices = []
val_matrices = []
test_matrices = []

# Position among the real datasets. It advances only when a dataset is
# actually assigned, so skipping the 'mean' record cannot shift the split
# boundaries (the enumerate index `i` is kept for the progress print only).
split_index = 0

for i, dataset in enumerate(datastore.find(load_type='dictionary')): # Each record is now a dataset

    print(f"----------------------Dataset #{i}: ID: {dataset['id']}----------------------")
    if dataset['id']=='mean':
        continue

    dependent_curves = dataset['curve_sets']['physics_cycle_series']['dependent']
    x_pos = dependent_curves['x_pos']['value'][:]
    y_pos = dependent_curves['y_pos']['value'][:]
    z_pos = dependent_curves['z_pos']['value'][:]

    # Current dataset matrix for features (rows are time steps)
    X = pd.DataFrame([x_pos, y_pos, z_pos]).transpose()
    X.columns=['x_pos','y_pos','z_pos']

    # Route the dataset to its split. Because the three sizes are rounded
    # independently they may not add up to num_datasets; any dataset beyond
    # the last boundary is left out, as before.
    if split_index < train_datasets:
        train_matrices.append(X.values)
    elif split_index < train_datasets+val_datasets:
        val_matrices.append(X.values)
    elif split_index < train_datasets+val_datasets+test_datasets:
        test_matrices.append(X.values)
    split_index += 1

# An empty split stays an empty 1-D array, matching the previous initial value.
X_train = np.stack(train_matrices) if train_matrices else np.array([])
X_val = np.stack(val_matrices) if val_matrices else np.array([])
X_test = np.stack(test_matrices) if test_matrices else np.array([])

# Plotting for acyclical data
# One panel per position component; every panel overlays the first dataset
# of the train, validation and test splits.
fig, ax = plt.subplots(nrows=3,sharex=True)
fig.suptitle('Example of Train, Val, and Test split')

split_examples = (('Train', X_train), ('Validation', X_val), ('Test', X_test))
for feature_idx, feature_name in enumerate(('x_pos', 'y_pos', 'z_pos')):
    panel = ax[feature_idx]
    for split_label, split_data in split_examples:
        panel.plot(time, split_data[0, :, feature_idx], label=split_label)
    panel.legend(fontsize='xx-small')
    panel.set_title(feature_name)

fig.tight_layout()
fig.savefig(os.path.join(image_path, 'example_split.png'))

# Scaling the data
Scaling the data so that all the features are around the same magnitude helps the model converge faster due to how the optimizers update the weights.

In [None]:
def _scale_split(split_name, split_data, scaler, fit=False):
    """Scale one (datasets, time steps, features) tensor feature by feature.

    The tensor is flattened to (datasets * time steps, features) so the scaler
    sees each feature as one long column, then folded back to its 3-D shape.

    Args:
        split_name: Label printed in front of the shape report.
        split_data: Rank-3 array to scale.
        scaler: Object exposing ``fit_transform`` and ``transform``.
        fit: When True the scaler is fitted on this split before transforming;
            otherwise the already-fitted scaler is only applied.

    Returns:
        Tuple ``(flat, scaled)``: the scaled 2-D array and the same values
        reshaped back to the input's 3-D shape.
    """
    n_sets, n_steps, n_feats = split_data.shape
    print(f'{split_name}:',split_data.shape)
    print('\tNumber of Datasets:', n_sets)
    print('\tNumber of Time Steps per Dataset:', n_steps)
    print('\tNumber of Features per Time Step:', n_feats)
    print('\n')

    # The target shape is passed positionally: the `newshape=` keyword of
    # np.reshape is deprecated in recent NumPy releases.
    flat = np.reshape(split_data, (-1, n_feats))
    flat = scaler.fit_transform(flat) if fit else scaler.transform(flat)
    return flat, np.reshape(flat, (n_sets, n_steps, n_feats))


# Shape of a single dataset (the last one loaded above)
num_time_steps, num_features = X.shape
print('Each whole dataset:',X.shape)
print('\tNumber of Time Steps in Each whole dataset:', num_time_steps)
print('\tNumber of Features per Time Step:', num_features)
print('\n')

# Scaler
scaler = MinMaxScaler()  # StandardScaler()

# Fit AND transform only for train data; validation and test data are only
# transformed so no statistics leak from them into the scaler.
X_train, X_train_scaled = _scale_split('X_train', X_train, scaler, fit=True)
X_val, X_val_scaled = _scale_split('X_val', X_val, scaler)
X_test, X_test_scaled = _scale_split('X_test', X_test, scaler)

# As before, the shape variables end up describing the test split.
num_datasets, num_time_steps, num_features = X_test_scaled.shape

# Turning numeric data into strings

We need to convert our numeric data to strings so that the transformer can process it. Each dataset consists of three features `'x'`, `'y'`, and `'z'` in a matrix of size `n_timesteps x n_features`.

$$
X_{train_{dataset=0}} = 
\begin{bmatrix}
    [xpos_{t=0} & ypos_{t=0} & zpos_{t=0}] \\
    [xpos_{t=1} & ypos_{t=1} & zpos_{t=1}] \\
    [xpos_{t=2} & ypos_{t=2} & zpos_{t=2}] \\
    [\vdots & \vdots & \vdots] \\
    [xpos_{t=end} & ypos_{t=end} & zpos_{t=end}] \\
\end{bmatrix}
$$

We first transpose each dataset so that each feature becomes its own "sentence" which results in a new matrix of size `n_features x n_timesteps`.

$$
X_{train\_string_{dataset=0}} = 
\begin{bmatrix}
    [xpos_{t=0} & xpos_{t=1} & xpos_{t=2} & \ldots & xpos_{t=end}] \\
    [ypos_{t=0} & ypos_{t=1} & ypos_{t=2} & \ldots & ypos_{t=end}] \\
    [zpos_{t=0} & zpos_{t=1} & zpos_{t=2} & \ldots & zpos_{t=end}] \\
\end{bmatrix}
$$

We then append each of those sentences for each dataset into a list containing the sentences for ALL the datasets.

$$
X_{train\_string} =  [ [xpos_{dataset=0,\,t=0} \ldots xpos_{dataset=0,\,t=end}], 
[ypos_{dataset=0,\,t=0} \ldots ypos_{dataset=0,\,t=end}],
[zpos_{dataset=0,\,t=0} \ldots zpos_{dataset=0,\,t=end}],
\ldots, 
[xpos_{dataset=end,\,t=0} \ldots xpos_{dataset=end,\,t=end}],
[ypos_{dataset=end,\,t=0} \ldots ypos_{dataset=end,\,t=end}],
[zpos_{dataset=end,\,t=0} \ldots zpos_{dataset=end,\,t=end}]]
$$

We pass that list of all sentences for all datasets into `tf.keras.layers.TextVectorization()` to create a vocabulary list from the available "words" and convert the "sentences" to the vocabulary list indices.
$$
X_{train\_string\_vocab} = [ [56 \ldots 75] , [113 \ldots 32], [3455 \ldots 675], \ldots, [546 \ldots 22] , [1123 \ldots 132], [35 \ldots 675] ]
$$

However, now that each number is a "word", the number of significant figures causes the same number to become different words (e.g., 1.23 vs 1.230). Also, when we pass in the training data, the vectorizer would never see a number or "word" that has different significant figures but is very close in value (e.g., 5.32456 vs 5.3245600001). One approach is to create a vocabulary based on a given range of numbers and significant figures. Below we define our significant figures `sig_figs` (to adjust our data) and timestep `dt` (to create our vocabulary).

In [None]:
# Precision shared by the sentence builder and the vocabulary:
# `sig_figs` digits are kept after the decimal point and `dt` is the
# matching step between neighbouring vocabulary "words".
sig_figs = 3
dt = 10 ** -sig_figs

# Use Tensorflow's TextVectorization() method
# (no text standardization, integer token ids as output)
vectorize_layer = tf.keras.layers.TextVectorization(
    standardize=None,
    output_mode='int',
)

def features_to_sentences(X_temp_scaled):
    """Turn every feature time series of every dataset into one "sentence".

    Each dataset is transposed so that a row holds one feature over time; the
    row's values are formatted with `sig_figs` decimals and joined by spaces.

    Args:
        X_temp_scaled: Iterable of 2-D arrays laid out as (time steps, features).

    Returns:
        Flat list of sentences, ordered dataset by dataset and, within a
        dataset, feature by feature. The features are no longer kept apart
        because the Transformer predicts sentences, not features.
    """
    number_format = f".{sig_figs}f"
    return [
        " ".join(format(value, number_format) for value in series)
        for dataset in X_temp_scaled
        for series in dataset.transpose()
    ]

# One sentence per feature per dataset, for each split
X_train_string, X_val_string, X_test_string = (
    features_to_sentences(split)
    for split in (X_train_scaled, X_val_scaled, X_test_scaled)
)

# See data
print(f"Original Data: {X_train_scaled.shape}\n", X_train_scaled[0])  # datasets x timesteps per feature x features per datasets
print(f"Sentence Data: {len(X_train_string)}\n", X_train_string[0])  # datasets x features per datasets = sentences

# Create vocabulary from a fixed numeric range instead of the available data
# vectorize_layer.adapt(X_train_string)
vocab_words = [f"{x:.{sig_figs}f}" for x in np.arange(-2, 2+dt, dt)]  # just in case out of range values
vocab_words.append(f"{0:.{sig_figs}f}")  # due to negative and string conversion range: 0 becomes -0
arange_vocab = " ".join(vocab_words)
vectorize_layer.adapt(arange_vocab)

# Print vocabulary
vocabulary = vectorize_layer.get_vocabulary()
VOCAB_SIZE = len(vocabulary)
print("\n\nTotal Number of unique 'words':", VOCAB_SIZE)
print("Vocabulary:\n", vectorize_layer.get_vocabulary())

# Convert x y z sentences into vocabulary indices
X_train_vectorized, X_val_vectorized, X_test_vectorized = (
    vectorize_layer(sentences)
    for sentences in (X_train_string, X_val_string, X_test_string)
)
print("Sentence Data in Vocabulary Index Form:\n", X_train_vectorized)


# Batch and Shuffle Data

We create individual sentence examples the size of `MAX_TOKENS` as we don't need the whole time series as one sentence. This also allows us to create more examples from each time series.

We then batch them in groups of `BATCH_SIZE` so we can train in that size accordingly.

Each example within each sentence has a corresponding translation label/target that is shifted over by one so the transformer can predict the next "word" in the sentence. The input original language and input translation language are the same since we aren't "translating" languages. The output translated language is the one that is shifted over by one.

* `X_train_examples.append(sentence[i:MAX_TOKENS+i])`
* `X_train_labels.append(sentence[i+1:MAX_TOKENS+i+1])`

We also shuffle the sentence examples to randomize the training data batches.

In [None]:
# Sub sentences from each sentence
import random
# Number of tokens in every example window cut out of a sentence.
MAX_TOKENS = 128
# Number of example windows grouped into one batch.
BATCH_SIZE = 16  # Must evenly divide into total examples or else Transformer model won't work

def create_examples(X_temp_vectorized):
    """Cut every sentence into sliding windows of `MAX_TOKENS` tokens.

    For each window start a pair is produced: the window itself (used as both
    the original and the translation input) and the same window shifted one
    token to the right (the prediction target).

    Args:
        X_temp_vectorized: Iterable of token sequences supporting `len` and slicing.

    Returns:
        Tuple ``(examples, labels)`` of equally long lists. A sentence that
        has no more than `MAX_TOKENS` tokens contributes nothing.
    """
    windows = []
    shifted_windows = []
    for sentence in X_temp_vectorized:
        for start in range(len(sentence) - MAX_TOKENS):
            stop = start + MAX_TOKENS
            windows.append(sentence[start:stop])
            # Target is the input moved over by one token
            shifted_windows.append(sentence[start + 1:stop + 1])
    return windows, shifted_windows

# Window every split into (example, label) pairs
X_train_examples, X_train_labels = create_examples(X_train_vectorized)
X_val_examples, X_val_labels = create_examples(X_val_vectorized)
X_test_examples, X_test_labels = create_examples(X_test_vectorized)

# Report the sizes; the batch counts should come out whole (double check that these evenly divide)
for split_name, split_examples in (("Train", X_train_examples),
                                   ("Validation", X_val_examples),
                                   ("Test", X_test_examples)):
    print(f"Total {split_name} examples: ", len(split_examples))
    print(f"Total {split_name} batches: ", len(split_examples)/BATCH_SIZE)

def shuffle_and_batch(X_temp_examples, X_temp_labels):
    """Shuffle example/label pairs together and group them into full batches.

    Pairs are shuffled in unison with `random.shuffle`, then collected into
    batches of exactly `BATCH_SIZE`; a trailing incomplete batch is discarded.

    Args:
        X_temp_examples: Sequence of examples.
        X_temp_labels: Sequence of labels aligned with the examples.

    Returns:
        Tuple ``(example_batches, example_batches, label_batches)``. The first
        two entries are the same list because the input original "language"
        and the input translated "language" are identical.

    Raises:
        ValueError: If there are no pairs to unzip.
    """
    pairs = list(zip(X_temp_examples, X_temp_labels))
    random.shuffle(pairs)
    shuffled_examples, shuffled_labels = zip(*pairs)

    example_batches = []
    label_batches = []
    pending_examples = []
    pending_labels = []
    for example, label in zip(shuffled_examples, shuffled_labels):
        pending_examples.append(example)
        pending_labels.append(label)
        if len(pending_examples) == BATCH_SIZE:
            # Batch is full: store it and start collecting the next one
            example_batches.append(pending_examples)
            label_batches.append(pending_labels)
            pending_examples = []
            pending_labels = []

    return example_batches, example_batches, label_batches


# Shuffle and batch every split; the first two results of each call are the same batches
(X_train_examples_batches_2,
 X_train_examples_batches,
 X_train_labels_batches) = shuffle_and_batch(X_train_examples, X_train_labels)
(X_val_examples_batches_2,
 X_val_examples_batches,
 X_val_labels_batches) = shuffle_and_batch(X_val_examples, X_val_labels)
(X_test_examples_batches_2,
 X_test_examples_batches,
 X_test_labels_batches) = shuffle_and_batch(X_test_examples, X_test_labels)

# Size of the first and last batch of every split
for split_name, split_batches in (("Train", X_train_examples_batches),
                                  ("Validation", X_val_examples_batches),
                                  ("Test", X_test_examples_batches)):
    print(f"First {split_name} Batch Length:", len(split_batches[0]))
    print(f"Last {split_name} Batch Length:", len(split_batches[-1]))

# Peek at one example
first_train_example = X_train_examples_batches[0][0]
print("First Train Batch First Example:", first_train_example)
print("First Train Batch First Example First 10 Tokens:", first_train_example[:10])




# Create Transformer

The following code cells are taken from the [Tensorflow Transformer Tutorial](https://www.tensorflow.org/text/tutorials/transformer).

[The embedding and positional encoding layer](https://www.tensorflow.org/text/tutorials/transformer#the_embedding_and_positional_encoding_layer)

In [None]:
###########################################
# Embedding and Positional Encoding Layer #
###########################################
def positional_encoding(length, depth):
    """Build the sinusoidal positional-encoding table.

    The first half of the last axis holds the sines and the second half the
    cosines of the position-dependent angles.

    Args:
        length: Number of positions (rows) to encode.
        depth: Encoding width. It is halved before use, so an even value is
            what yields exactly `depth` columns.

    Returns:
        A float32 tensor with one row per position.
    """
    half_depth = depth / 2

    position_column = np.arange(length)[:, np.newaxis]                 # (seq, 1)
    depth_row = np.arange(half_depth)[np.newaxis, :] / half_depth      # (1, depth)

    angle_rates = 1 / (10000**depth_row)                               # (1, depth)
    angle_rads = position_column * angle_rates                         # (pos, depth)

    sines = np.sin(angle_rads)
    cosines = np.cos(angle_rads)
    pos_encoding = np.concatenate([sines, cosines], axis=-1)

    return tf.cast(pos_encoding, dtype=tf.float32)

class PositionalEmbedding(tf.keras.layers.Layer):
    """Token embedding with a fixed positional encoding added on top."""

    def __init__(self, vocab_size, d_model):
        """Create the embedding table and precompute 2048 positions of encoding."""
        super().__init__()
        self.d_model = d_model
        self.embedding = tf.keras.layers.Embedding(vocab_size, d_model, mask_zero=True)
        self.pos_encoding = positional_encoding(length=2048, depth=d_model)

    def compute_mask(self, *args, **kwargs):
        """Delegate mask computation to the wrapped embedding layer."""
        return self.embedding.compute_mask(*args, **kwargs)

    def call(self, x):
        """Embed token ids, rescale by sqrt(d_model) and add the encoding."""
        seq_len = tf.shape(x)[1]
        embedded = self.embedding(x)
        # This factor sets the relative scale of the embedding and positional_encoding.
        scale = tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        embedded = embedded * scale
        return embedded + self.pos_encoding[tf.newaxis, :seq_len, :]



[The base attention layer](https://www.tensorflow.org/text/tutorials/transformer#the_base_attention_layer)

In [None]:
########################
# Base Attention Layer #
########################

class BaseAttention(tf.keras.layers.Layer):
    """Shared building blocks for the attention layers.

    Holds a multi-head attention layer, a layer normalization and an `Add`
    layer; subclasses supply the `call` that wires them together.
    """

    def __init__(self, **kwargs):
        """Forward every keyword argument to `MultiHeadAttention`."""
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
        self.layernorm = tf.keras.layers.LayerNormalization()
        self.add = tf.keras.layers.Add()

[The cross attention layer](https://www.tensorflow.org/text/tutorials/transformer#the_cross_attention_layer)

In [None]:
#########################
# Cross Attention Layer #
#########################

class CrossAttention(BaseAttention):
    """Attention whose queries come from `x` and keys/values from `context`."""

    def call(self, x, context):
        """Attend over `context`, then apply the residual add and layer norm."""
        attended, scores = self.mha(
            query=x,
            key=context,
            value=context,
            return_attention_scores=True,
        )

        # Cache the attention scores for plotting later.
        self.last_attn_scores = scores

        residual = self.add([x, attended])
        return self.layernorm(residual)

[The global self-attention layer](https://www.tensorflow.org/text/tutorials/transformer#the_global_self-attention_layer)

In [None]:
##########################
# Global Attention Layer #
##########################

class GlobalSelfAttention(BaseAttention):
    """Self-attention without a causal mask."""

    def call(self, x):
        """Attend `x` to itself, then apply the residual add and layer norm."""
        attended = self.mha(query=x, value=x, key=x)
        residual = self.add([x, attended])
        return self.layernorm(residual)

[The causal self-attention layer](https://www.tensorflow.org/text/tutorials/transformer#the_causal_self-attention_layer)

In [None]:
##########################
# Causal Attention Layer #
##########################

class CausalSelfAttention(BaseAttention):
    """Self-attention computed with `use_causal_mask=True`."""

    def call(self, x):
        """Attend `x` to itself under the causal mask, then add and normalize."""
        attended = self.mha(query=x, value=x, key=x, use_causal_mask=True)
        residual = self.add([x, attended])
        return self.layernorm(residual)

 [The feed forward network](https://www.tensorflow.org/text/tutorials/transformer#the_feed_forward_network)

In [None]:
######################
# Feed Forward Layer #
######################

class FeedForward(tf.keras.layers.Layer):
    """Two dense layers with dropout, wrapped in a residual add and layer norm."""

    def __init__(self, d_model, dff, dropout_rate=0.1):
        """Build the dense stack.

        Args:
            d_model: Width of the output dense layer.
            dff: Width of the hidden ReLU dense layer.
            dropout_rate: Rate of the dropout applied after the second dense layer.
        """
        super().__init__()
        hidden = tf.keras.layers.Dense(dff, activation='relu')
        projection = tf.keras.layers.Dense(d_model)
        dropout = tf.keras.layers.Dropout(dropout_rate)
        self.seq = tf.keras.Sequential([hidden, projection, dropout])
        self.add = tf.keras.layers.Add()
        self.layer_norm = tf.keras.layers.LayerNormalization()

    def call(self, x):
        """Add the dense stack's output to `x` and normalize the sum."""
        residual = self.add([x, self.seq(x)])
        return self.layer_norm(residual)

[The encoder layer](https://www.tensorflow.org/text/tutorials/transformer#the_encoder_layer)

In [None]:
#################
# Encoder Layer #
#################

class EncoderLayer(tf.keras.layers.Layer):
    """One encoder block: global self-attention followed by a feed-forward network."""

    def __init__(self,*, d_model, num_heads, dff, dropout_rate=0.1):
        """Build the two sub-layers.

        Args:
            d_model: Model width; also used as the attention `key_dim`.
            num_heads: Number of attention heads.
            dff: Hidden width of the feed-forward network.
            dropout_rate: Dropout rate used by the attention layer and by the
                feed-forward network.
        """
        super().__init__()

        self.self_attention = GlobalSelfAttention(
            num_heads=num_heads,
            key_dim=d_model,
            dropout=dropout_rate)

        # Forward dropout_rate so the feed-forward dropout follows the
        # configured value instead of silently staying at FeedForward's default.
        self.ffn = FeedForward(d_model, dff, dropout_rate=dropout_rate)

    def call(self, x):
        """Run self-attention, then the feed-forward network."""
        x = self.self_attention(x)
        x = self.ffn(x)
        return x

[The encoder](https://www.tensorflow.org/text/tutorials/transformer#the_encoder)

In [None]:
###############
# THE ENCODER #
###############

class Encoder(tf.keras.layers.Layer):
    """Positional embedding, dropout and a stack of `EncoderLayer` blocks."""

    def __init__(self, *, num_layers, d_model, num_heads,
                 dff, vocab_size, dropout_rate=0.1):
        """Create the embedding, `num_layers` encoder layers and the dropout."""
        super().__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.pos_embedding = PositionalEmbedding(
            vocab_size=vocab_size, d_model=d_model)

        layer_config = dict(d_model=d_model,
                            num_heads=num_heads,
                            dff=dff,
                            dropout_rate=dropout_rate)
        self.enc_layers = [EncoderLayer(**layer_config) for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(dropout_rate)

    def call(self, x):
        """Encode token ids of shape (batch, seq_len) to (batch, seq_len, d_model)."""
        hidden = self.pos_embedding(x)  # Shape `(batch_size, seq_len, d_model)`.

        # Add dropout.
        hidden = self.dropout(hidden)

        for layer in self.enc_layers:
            hidden = layer(hidden)

        return hidden  # Shape `(batch_size, seq_len, d_model)`.

[The decoder layer](https://www.tensorflow.org/text/tutorials/transformer#the_decoder_layer)

In [None]:
#################
# Decoder Layer #
#################

class DecoderLayer(tf.keras.layers.Layer):
    """One decoder block: causal self-attention, cross-attention, feed-forward."""

    def __init__(self,
                 *,
                 d_model,
                 num_heads,
                 dff,
                 dropout_rate=0.1):
        """Build the three sub-layers.

        Args:
            d_model: Model width; also used as the attention `key_dim`.
            num_heads: Number of attention heads.
            dff: Hidden width of the feed-forward network.
            dropout_rate: Dropout rate used by both attention layers and by
                the feed-forward network.
        """
        super().__init__()

        self.causal_self_attention = CausalSelfAttention(
            num_heads=num_heads,
            key_dim=d_model,
            dropout=dropout_rate)

        self.cross_attention = CrossAttention(
            num_heads=num_heads,
            key_dim=d_model,
            dropout=dropout_rate)

        # Forward dropout_rate so the feed-forward dropout follows the
        # configured value instead of silently staying at FeedForward's default.
        self.ffn = FeedForward(d_model, dff, dropout_rate=dropout_rate)

    def call(self, x, context):
        """Decode `x` while attending to the encoder output `context`."""
        x = self.causal_self_attention(x=x)
        x = self.cross_attention(x=x, context=context)

        # Cache the last attention scores for plotting later
        self.last_attn_scores = self.cross_attention.last_attn_scores

        x = self.ffn(x)  # Shape `(batch_size, seq_len, d_model)`.
        return x

[The decoder](https://www.tensorflow.org/text/tutorials/transformer#the_decoder)

In [None]:
###############
# THE DECODER #
###############

class Decoder(tf.keras.layers.Layer):
    """Positional embedding, dropout and a stack of `DecoderLayer` blocks."""

    def __init__(self, *, num_layers, d_model, num_heads, dff, vocab_size,
                 dropout_rate=0.1):
        """Create the embedding, the dropout and `num_layers` decoder layers."""
        super().__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.pos_embedding = PositionalEmbedding(vocab_size=vocab_size,
                                                 d_model=d_model)
        self.dropout = tf.keras.layers.Dropout(dropout_rate)

        layer_config = dict(d_model=d_model, num_heads=num_heads,
                            dff=dff, dropout_rate=dropout_rate)
        self.dec_layers = [DecoderLayer(**layer_config) for _ in range(num_layers)]

        self.last_attn_scores = None

    def call(self, x, context):
        """Decode token ids of shape (batch, target_seq_len) against `context`."""
        hidden = self.pos_embedding(x)  # (batch_size, target_seq_len, d_model)

        hidden = self.dropout(hidden)

        for layer in self.dec_layers:
            hidden = layer(hidden, context)

        # Keep the scores of the final layer's cross-attention
        self.last_attn_scores = self.dec_layers[-1].last_attn_scores

        # The shape of the result is (batch_size, target_seq_len, d_model).
        return hidden

[The Transformer](https://www.tensorflow.org/text/tutorials/transformer#the_transformer)

In [None]:
###################
# THE TRANSFORMER #
###################

class Transformer(tf.keras.Model):
    """Encoder-decoder model ending in a dense layer over the target vocabulary."""

    def __init__(self, *, num_layers, d_model, num_heads, dff,
                 input_vocab_size, target_vocab_size, dropout_rate=0.1):
        """Create the encoder, the decoder and the final projection."""
        super().__init__()

        # Settings common to the encoder and the decoder
        shared_config = dict(num_layers=num_layers, d_model=d_model,
                             num_heads=num_heads, dff=dff,
                             dropout_rate=dropout_rate)

        self.encoder = Encoder(vocab_size=input_vocab_size, **shared_config)
        self.decoder = Decoder(vocab_size=target_vocab_size, **shared_config)
        self.final_layer = tf.keras.layers.Dense(target_vocab_size)

    def call(self, inputs):
        """Return logits of shape (batch_size, target_len, target_vocab_size).

        Args:
            inputs: Pair ``(context, x)``. To use a Keras model with `.fit`
                all inputs must be passed in the first argument.
        """
        context, x = inputs

        encoded = self.encoder(context)  # (batch_size, context_len, d_model)
        decoded = self.decoder(x, encoded)  # (batch_size, target_len, d_model)

        # Final linear layer output.
        logits = self.final_layer(decoded)

        try:
            # Drop the keras mask, so it doesn't scale the losses/metrics.
            # b/250038731
            del logits._keras_mask
        except AttributeError:
            pass

        return logits

# Setting it up

Now that the Transformer and its layers have been created we can start setting it up. The only parameters that were changed from the original tutorial were the `input_vocab_size` and `target_vocab_size` as we used a different tokenizer: `tf.keras.layers.TextVectorization()`.

[Hyperparameters](https://www.tensorflow.org/text/tutorials/transformer#hyperparameters)

In [None]:
#############
# Set it up #
#############

# Architecture hyperparameters
num_layers = 4      # The number of encoder and decoder layers
d_model = 128       # The Positional Embedding depth (how many "linguistic" features to create per word)
dff = 512           # How many nodes the feed forward network has per layer
num_heads = 8       # The number of self-attention heads
dropout_rate = 0.1  # Dropout rate handed to the encoder and the decoder

# Input and target share one vocabulary since both "languages" are the same.
# NOTE(review): VOCAB_SIZE is already the full length of get_vocabulary();
# the extra +1 looks like head-room — confirm.
transformer = Transformer(
    num_layers=num_layers, d_model=d_model, num_heads=num_heads, dff=dff,
    input_vocab_size=VOCAB_SIZE+1, target_vocab_size=VOCAB_SIZE+1,
    dropout_rate=dropout_rate,
)

# Training Set Up

See [Training](https://www.tensorflow.org/text/tutorials/transformer#training) for more information.

We also included [Horovod](https://horovod.ai/) which is a library that allows distributed training of datasets for multiple deep learning libraries. A user can elect to use it by uncommenting the `# GPU Utilization #` section. Some more examples are [here](https://github.com/horovod/horovod/tree/master/examples).

In [None]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Learning rate that grows linearly during warm-up, then decays as 1/sqrt(step)."""

    def __init__(self, d_model, warmup_steps=4000):
        """Store the model width (as float32) and the number of warm-up steps."""
        super().__init__()
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        """Return rsqrt(d_model) * min(rsqrt(step), step * warmup_steps ** -1.5)."""
        step = tf.cast(step, dtype=tf.float32)
        decay_term = tf.math.rsqrt(step)
        warmup_term = step * (self.warmup_steps ** -1.5)
        smaller_term = tf.math.minimum(decay_term, warmup_term)
        return tf.math.rsqrt(self.d_model) * smaller_term


# Warm-up/decay schedule sized by the model width
learning_rate = CustomSchedule(d_model)

# Adam driven by the schedule above
optimizer = tf.keras.optimizers.Adam(
    learning_rate,
    beta_1=0.9,
    beta_2=0.98,
    epsilon=1e-9,
)

# Loss and metrics 
def masked_loss(label, pred):
    """Mean sparse categorical cross-entropy over the non-zero labels.

    Args:
        label: Integer token ids; positions equal to 0 are ignored.
        pred: Logits for every position.

    Returns:
        Sum of the per-position losses at non-zero labels divided by the
        number of such positions.
    """
    keep = label != 0
    cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')
    per_position = cross_entropy(label, pred)

    keep = tf.cast(keep, dtype=per_position.dtype)
    kept_loss = per_position * keep

    return tf.reduce_sum(kept_loss)/tf.reduce_sum(keep)


def masked_accuracy(label, pred):
    """Fraction of non-zero label positions whose arg-max prediction is correct.

    Args:
        label: Integer token ids; positions equal to 0 are ignored.
        pred: Logits with the vocabulary on axis 2.

    Returns:
        Number of correct predictions at non-zero labels divided by the
        number of non-zero labels.
    """
    predicted_ids = tf.argmax(pred, axis=2)
    label = tf.cast(label, predicted_ids.dtype)

    counted = label != 0
    correct = (label == predicted_ids) & counted

    correct = tf.cast(correct, dtype=tf.float32)
    counted = tf.cast(counted, dtype=tf.float32)
    return tf.reduce_sum(correct)/tf.reduce_sum(counted)

# Defaults handed to `fit` below; the optional Horovod section replaces both
# when it is uncommented.
verbose = 1
callbacks = []

###################
# GPU Utilization #
###################
# Optional distributed training with Horovod. Left commented out; enabling it
# also requires the `horovod.tensorflow` import at the top of the file.
# hvd.init()

# if use_gpu:
#     device = 'GPU'
#     gpus = tf.config.experimental.list_physical_devices('GPU')
#     for gpu in gpus:
#         tf.config.experimental.set_memory_growth(gpu, True)
#     if gpus:
#         tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
# else:
#     device = 'CPU'
#     os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

# optimizer = hvd.DistributedOptimizer(optimizer)

# callbacks = [
#     # Horovod: broadcast initial variable states from rank 0 to all other processes.
#     # This is necessary to ensure consistent initialization of all workers when
#     # training is started with random weights or restored from a checkpoint.
#     hvd.callbacks.BroadcastGlobalVariablesCallback(0),

#     # Horovod: average metrics among workers at the end of every epoch.
#     #
#     # Note: This callback must be in the list before the ReduceLROnPlateau,
#     # TensorBoard or other metrics-based callbacks.
#     hvd.callbacks.MetricAverageCallback(),
# ]


# # Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
# if hvd.rank() == 0:
#     callbacks.append(tf.keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))

# # Horovod: write logs on worker 0.
# verbose = 1 if hvd.rank() == 0 else 0

# print('Number of %ss: %d' % (device, hvd.size()))

###########
# Compile #
###########
# Train with the masked loss and report the masked accuracy, both of which
# ignore positions whose label is 0.
transformer.compile(
    loss=masked_loss,
    optimizer=optimizer,
    metrics=[masked_accuracy])

# Train The Model

We will now convert our Train, Validation, and Test data matrices into the [Tensorflow Dataset](https://www.tensorflow.org/api_docs/python/tf/data/Dataset) object format so that it is compatible with the Transformer.

In [None]:
# Wrap every split as a tf.data.Dataset of ((context, decoder input), label).
# The context and the decoder input hold the same batches because the input
# original "language" and the input translated "language" are identical.
train_batches = tf.data.Dataset.from_tensor_slices(((X_train_examples_batches, X_train_examples_batches_2), X_train_labels_batches))
val_batches = tf.data.Dataset.from_tensor_slices(((X_val_examples_batches, X_val_examples_batches_2), X_val_labels_batches))
test_batches = tf.data.Dataset.from_tensor_slices(((X_test_examples_batches, X_test_examples_batches_2), X_test_labels_batches))

# print(X_train_examples_batches)
# print(train_batches)

###################
# Train the model #
###################

history = transformer.fit(train_batches,
                          epochs=20,
                          validation_data=val_batches,
                          callbacks=callbacks,
                          verbose=verbose)

###########################
# Plot the learning curve #
###########################
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='Train')
ax.plot(history.history['val_loss'], label='Val')
ax.set_xlabel('epochs')
# The model is compiled with the masked cross-entropy loss, not a mean
# squared error, so the axis is labelled generically.
ax.set_ylabel('Loss')
ax.set_title('Train and Validation Loss')
ax.legend()

fig.savefig(os.path.join(image_path, 'learning_curve.png'))

##################
# Save the model #
##################
model_path = os.path.join(out_path, 'transformer-ball-bounce', 'my_transformer_model.weights.h5')
transformer.save_weights(model_path)


# Inference

The `Translator()` class in [Run inference](https://www.tensorflow.org/text/tutorials/transformer#run_inference) had to be modified for the numerical data. The smaller modifications were mostly in using the `vectorize_layer` `tf.keras.layers.TextVectorization()` object instead of the default `tokenizer` object from the tutorial. We also created the function `get_translations()` for the different x, y, and z data.

However, one large change within the `Translator.__call__()` method is that we gave the whole sentence as context to the translator before the loop. We do this so that the prediction is for the t+1 "word" instead of all the individual "words" since we already know the "translation" for each "word" in the "sentence". We also modified the loop so that at the end of it we shift the "sentence" over with the t+1 "word" since we now want the t+2 "word" and so on. 

This is different than the original tutorial since the original tutorial is interested in translating all the individual "words" within the sentence when we only want the t+1 "word". Thus they only give the first word for context instead of the whole sentence since at each loop it translates the next "word" within the sentence rather than t+1 "word".

In [None]:
# You can also load it for other workflows
# Same path the training cell saves to, so the weights written there are restored here.
model_path = os.path.join(out_path, 'transformer-ball-bounce', 'my_transformer_model.weights.h5')
transformer.load_weights(model_path)

class Translator(tf.Module):
  """Roll the trained transformer forward one "word" (time step) at a time.

  Unlike the text-translation tutorial this class is adapted from, the whole
  input "sentence" is used as context and only the next (t+1) "word" is
  predicted. After every prediction the window slides by one position: the
  oldest "word" is dropped and the newly predicted one is appended.

  Args:
    vectorize_layer: Callable that maps a string tensor to token ids and
      exposes `get_vocabulary()` to map ids back to strings.
    transformer: Model called as `transformer([encoder_input, decoder_input],
      training=False)`; the last position of its output is arg-maxed to pick
      the next token id.
  """

  def __init__(self, vectorize_layer, transformer):
    super().__init__()
    self.vectorize_layer = vectorize_layer
    self.transformer = transformer

  def __call__(self, sentence, max_length):
    """Predict `max_length` future "words" that follow `sentence`.

    Args:
      sentence: `tf.Tensor` of strings holding the space-separated input
        window. A scalar is promoted to a batch of one.
      max_length: Number of future "words" to predict.

    Returns:
      A `(text, tokens)` pair: `text` is the predicted "words" joined by
      single spaces and `tokens` is the list of the matching integer ids.
    """
    assert isinstance(sentence, tf.Tensor)
    if len(sentence.shape) == 0:
      sentence = sentence[tf.newaxis]

    # No `[START]`/`[END]` tokens are needed: this is not an actual language
    # whose sentences have a start and an end.
    encoder_input = self.vectorize_layer(sentence)

    # Report progress roughly every 10% of the run. Clamp to 1 so that short
    # runs (max_length < 10) do not end up computing a modulo by zero.
    progress_every = max(1, int(max_length * .1))

    # The t+1 "word" predicted at each step.
    predicted_tokens = []

    for i in tf.range(max_length):
      if i % progress_every == 0:
        print(f"Timestep {i+1} of {max_length}")

      # The "translation" of every "word" in the window is already known, so
      # the decoder is fed the current window itself (first row only) and the
      # model is asked just for the "word" that follows it.
      decoder_input = encoder_input[:1]
      predictions = self.transformer([encoder_input, decoder_input], training=False)

      # Select the last token from the `seq_len` dimension.
      predictions = predictions[:, -1:, :]  # Shape `(batch_size, 1, vocab_size)`.

      predicted_id = tf.argmax(predictions, axis=-1)

      # Slide the window: drop the oldest "word", append the predicted one.
      encoder_input = tf.reshape(
          tf.concat([encoder_input[0, 1:], predicted_id[0]], axis=0), [1, -1])

      predicted_tokens.append(int(encoder_input[0, -1]))

    # Use the layer this instance was built with, not a module-level global.
    vocab = self.vectorize_layer.get_vocabulary()
    text = " ".join(vocab[token] for token in predicted_tokens)

    return text, predicted_tokens


# Shared translator instance used by the inference helpers below.
translator = Translator(vectorize_layer=vectorize_layer, transformer=transformer)


def print_translation(sentence, translated_text, ground_truth):
    """Print the input window, the prediction, and its error vs. the truth.

    All three arguments are strings of whitespace-separated numbers. The
    absolute difference between `ground_truth` and `translated_text` is
    printed with `sig_figs` decimals (a module-level setting), followed by the
    index of the first non-zero difference (`nan` when the two agree
    everywhere).

    Args:
        sentence: The "words" that were given to the translator as context.
        translated_text: The "words" predicted by the translator.
        ground_truth: The "words" the prediction is compared against; must
            contain as many numbers as `translated_text`.
    """
    print(f'Input {len(sentence.split())}:\n{sentence}')
    print(f'Prediction {len(translated_text.split())}:\n{translated_text}')
    print(f'Ground truth {len(ground_truth.split())}:\n{ground_truth}')

    # Split on any run of whitespace for both strings, matching the counts
    # printed above; `split(" ")` would yield '' for doubled/trailing spaces
    # and make `float()` fail.
    translated_text_float = [float(x) for x in translated_text.split()]
    ground_truth_float = [float(x) for x in ground_truth.split()]
    diff = np.abs(np.array(ground_truth_float) - np.array(translated_text_float))

    # First position where prediction and truth disagree, or nan if none.
    nonzero_positions = np.nonzero(diff)[0]
    index = nonzero_positions[0] if nonzero_positions.size else np.nan

    diff = " ".join(f"{x:.{sig_figs}f}" for x in diff)
    print(f'Abs(Ground truth - Prediction):\n{diff}')
    print(f'First non-zero index: {index}')

def get_translations(X_temp_scaled):
    """Run the translator on the x, y and z series of the first dataset.

    Only `X_temp_scaled[0]` is used. For each of its first three transposed
    rows, the leading `MAX_TOKENS` values form the input "sentence" and the
    remaining values are the ground truth the prediction is printed against.

    Returns:
        A tuple `(translated_x, translated_y, translated_z)` of lists of
        floats parsed from the predicted "words".
    """

    def sentence_translation(X_temp_scaled_i):
        """Translate one series and return the prediction as floats."""

        def as_sentence(values):
            # Fixed-precision "words" separated by single spaces.
            return " ".join(f"{value:.{sig_figs}f}" for value in values)

        sentence = as_sentence(X_temp_scaled_i[:MAX_TOKENS])
        ground_truth = as_sentence(X_temp_scaled_i[MAX_TOKENS:])

        translated_text, translated_tokens = translator(
            tf.constant(sentence),
            max_length=len(time)-MAX_TOKENS,
        )

        print_translation(sentence, translated_text, ground_truth)

        # Convert the predicted "sentence" back into a list of floats.
        return list(map(float, translated_text.split()))

    per_axis = []
    for column, axis_label in enumerate("xyz"):
        print(f"\n-----{axis_label} data-----\n")
        per_axis.append(sentence_translation(X_temp_scaled[0].transpose()[column]))

    return tuple(per_axis)

# Predict the x/y/z series for the first dataset of every split.
print("\n\n----------Train Data:----------\n\n")
train_translations = get_translations(X_train_scaled)
translated_x_train, translated_y_train, translated_z_train = train_translations

print("\n\n----------Validation Data:----------\n\n")
val_translations = get_translations(X_val_scaled)
translated_x_val, translated_y_val, translated_z_val = val_translations

print("\n\n----------Test Data:----------\n\n")
test_translations = get_translations(X_test_scaled)
translated_x_test, translated_y_test, translated_z_test = test_translations


# Inference Plotting

Here we plot the transformer's predictions against the original x, y, and z positions for the first train, validation, and test dataset.

In [None]:
# Show the first training dataset before and after `scaler.inverse_transform`.
print("ORIGINAL SCALED:\n")
print(X_train_scaled[0])
original_train = scaler.inverse_transform(X_train_scaled[0])
print("ORIGINAL UNSCALED:\n")
print(original_train)

original_val = scaler.inverse_transform(X_val_scaled[0])
original_test = scaler.inverse_transform(X_test_scaled[0])

# Arrange each split's predicted x/y/z lists as the columns of one array and
# pass it through the same inverse transform as the originals.
X_predicted_train = np.array([translated_x_train, translated_y_train, translated_z_train]).T
X_whole_train_unscaled = scaler.inverse_transform(X_predicted_train)
print(f"PREDICTED UNSCALED len(time) - {len(time)} - MAX_TOKENS: {MAX_TOKENS}: {len(time) - MAX_TOKENS}\n")
# Uncomment to inspect the un-scaled training prediction:
# print(X_whole_train_unscaled)

X_predicted_val = np.array([translated_x_val, translated_y_val, translated_z_val]).T
X_whole_val_unscaled = scaler.inverse_transform(X_predicted_val)

X_predicted_test = np.array([translated_x_test, translated_y_test, translated_z_test]).T
X_whole_test_unscaled = scaler.inverse_transform(X_predicted_test)

# ------------------------------------------------------------------
# Plot the prediction: one row per coordinate, one column per split
# ------------------------------------------------------------------
fig, ax = plt.subplots(nrows=3, ncols=3, sharex=True, figsize=(9, 9))
fig.suptitle('Whole Prediction')

plot_splits = (
    ('Train', original_train, X_whole_train_unscaled),
    ('Validation', original_val, X_whole_val_unscaled),
    ('Test', original_test, X_whole_test_unscaled),
)

for coord_idx, coord_name in enumerate(('x', 'y', 'z')):
    for split_idx, (split_label, split_truth, split_pred) in enumerate(plot_splits):
        panel = ax[coord_idx, split_idx]
        # Full original series, the window handed to the translator, and the
        # values predicted for the time steps after that window.
        panel.plot(time, split_truth[:, coord_idx], label='Original')
        panel.plot(time[:MAX_TOKENS], split_truth[:MAX_TOKENS, coord_idx], label='Initial Guess')
        panel.plot(time[MAX_TOKENS:], split_pred[:, coord_idx], label='Whole Prediction')
        panel.legend(fontsize='xx-small')
        panel.set_title(f'{split_label} {coord_name}_pos')

fig.tight_layout()
fig.savefig(os.path.join(image_path, 'whole_prediction.png'))