# Standard LSTM (TensorFlow)

In [None]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

# Seed NumPy and TensorFlow from a single value so repeated runs match
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Experiment configuration
num_samples = 10_000  # total number of examples
T = 100               # length of the sequence to copy; also worth trying 200, 500 or 1000
vocab_size = 10       # ordinary tokens are 1..10 (e.g., letters a-j)
hidden_size = 128
batch_size = 128
epochs = 20

# Special tokens:
#   0              -> blank
#   vocab_size + 1 -> delimiter (first id past the ordinary tokens)
delimiter_token = 1 + vocab_size

def generate_copy_data(num_samples, T, vocab_size):
    """
    Generate data for the copy task.

    For each sample:
      - Draw a random sequence of T tokens (each in 1..vocab_size).
      - Build an input row of length 2*T + 1:
           [random sequence] + [delimiter] + [T blanks (0)]
      - Build a target row of the same length:
           [T+1 blanks (0)] + [the original random sequence]
    Only the last T positions of the target carry information.

    The delimiter id is derived from ``vocab_size`` (``vocab_size + 1``) so the
    function does not depend on any module-level variable.

    Args:
        num_samples: number of (input, target) rows to generate.
        T: length of the sequence that has to be copied.
        vocab_size: number of ordinary tokens; ids 1..vocab_size are sampled.

    Returns:
        Tuple ``(X, Y)`` of ``np.int32`` arrays, each of shape
        ``(num_samples, 2*T + 1)``.

    Note:
        Sampling uses NumPy's global random state (``np.random.randint``);
        call ``np.random.seed`` beforehand for repeatable data.
    """
    delimiter = vocab_size + 1
    seq_length = 2 * T + 1

    # Both arrays start as all blanks (0); only the non-blank parts are written.
    X = np.zeros((num_samples, seq_length), dtype=np.int32)
    Y = np.zeros((num_samples, seq_length), dtype=np.int32)

    # One vectorised draw for every sample (upper bound of randint is exclusive).
    sequences = np.random.randint(1, vocab_size + 1, size=(num_samples, T))

    X[:, :T] = sequences       # sequence to memorise
    X[:, T] = delimiter        # "start recalling" marker
    Y[:, T + 1:] = sequences   # expected output after the delimiter
    return X, Y

# Build the full dataset once
X, Y = generate_copy_data(num_samples=num_samples, T=T, vocab_size=vocab_size)

# The first 70% of the rows are used for training, the remaining 30% for validation
split_index = int(0.7 * num_samples)
X_train, X_val = X[:split_index], X[split_index:]
Y_train, Y_val = Y[:split_index], Y[split_index:]

# Wrap the arrays as batched tf.data pipelines; only the training data is shuffled
train_dataset = (
    tf.data.Dataset.from_tensor_slices((X_train, Y_train))
    .shuffle(1000)
    .batch(batch_size)
)
val_dataset = tf.data.Dataset.from_tensor_slices((X_val, Y_val))
val_dataset = val_dataset.batch(batch_size)

# Define the LSTM model.
# Note: This uses the built-in Keras LSTM layer.
class LSTMModel(tf.keras.Model):
    """Embedding -> LSTM -> per-time-step softmax classifier for the copy task.

    Args:
        vocab_size: number of ordinary tokens (ids 1..vocab_size).
        hidden_size: used both as the embedding width and as the number of
            LSTM units.
    """

    def __init__(self, vocab_size, hidden_size):
        super().__init__()
        # Inputs can contain 0 (blank), 1..vocab_size and vocab_size + 1
        # (delimiter), so the embedding table needs vocab_size + 2 rows.
        num_input_tokens = vocab_size + 2
        # Targets lie in [0, vocab_size]; the delimiter is never predicted.
        num_output_classes = vocab_size + 1

        self.embedding = tf.keras.layers.Embedding(num_input_tokens, hidden_size)
        # Built-in Keras LSTM; a custom LSTMCell could be used instead by
        # wrapping it with tf.keras.layers.RNN.
        self.lstm = tf.keras.layers.LSTM(hidden_size, return_sequences=True, return_state=True)
        self.dense = tf.keras.layers.Dense(num_output_classes, activation='softmax')

    def call(self, inputs, states=None, training=False):
        """Return per-time-step class probabilities for a batch of token ids.

        Args:
            inputs: integer token ids fed to the embedding layer.
            states: optional initial LSTM state, forwarded as ``initial_state``.
            training: accepted for Keras compatibility; not forwarded to the
                sub-layers.

        Returns:
            Softmax output of the dense layer applied to the LSTM sequence output.
        """
        embedded = self.embedding(inputs)
        lstm_kwargs = {} if states is None else {'initial_state': states}
        # return_state=True also yields the final h and c; they are not needed here.
        sequence_out, _, _ = self.lstm(embedded, **lstm_kwargs)
        return self.dense(sequence_out)

# Build the network
model = LSTMModel(vocab_size=vocab_size, hidden_size=hidden_size)

# Configure training. Targets are integer class ids and the model emits
# (batch, time, vocab_size+1) probabilities, hence the sparse categorical loss.
# NOTE: loss and accuracy are computed over every time step of the sequence,
# including the leading positions whose target is the blank token.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Fit on the training pipeline, validating after each epoch
history = model.fit(train_dataset, validation_data=val_dataset, epochs=epochs)

# Draw loss (left panel) and accuracy (right panel) curves side by side
plt.figure(figsize=(12, 5))
for panel, (metric, label) in enumerate((('loss', 'Loss'), ('accuracy', 'Accuracy')), start=1):
    plt.subplot(1, 2, panel)
    plt.plot(history.history[metric], marker='o', label='Train ' + label)
    plt.plot(history.history['val_' + metric], marker='o', label='Validation ' + label)
    plt.title(label + ' over Epochs')
    plt.xlabel('Epoch')
    plt.ylabel(label)
    plt.legend()

plt.tight_layout()
plt.show()


Epoch 1/20
55/55 ━━━━━━━━━━━━━━━━━━━━ 47s 796ms/step - accuracy: 0.5091 - loss: 1.4470 - val_accuracy: 0.5517 - val_loss: 1.1492
Epoch 2/20
10/55 ━━━━━━━━━━━━━━━━━━━━ 31s 709ms/step - accuracy: 0.5515 - loss: 1.1491

In [25]:
# Run the trained model on a hand-written sequence and print what it copies back.

# Mapping for letters (a-j) to token integers and vice versa
letter_to_token = {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10}
token_to_letter = {v: k for k, v in letter_to_token.items()}

# Special tokens. The delimiter is the first id past the ordinary tokens
# (11 here), matching vocab_size + 1 used when the training data was built.
delimiter_token = len(letter_to_token) + 1
blank_token = 0

# Example custom input: the sequence (letters) followed by the delimiter "=".
# For instance, "ababdceijhg=" represents the sequence to copy.
custom_input = "ababdceijhg="

# The trailing '=' stands for the delimiter token and is mandatory
if not custom_input.endswith('='):
    raise ValueError("Custom input must end with '=' to indicate the delimiter.")

# Everything before the '=' is the sequence the model should reproduce
original_seq_str = custom_input[:-1]

# Fail with a readable message instead of a bare KeyError on unsupported letters
unknown_letters = sorted(set(original_seq_str) - set(letter_to_token))
if unknown_letters:
    raise ValueError(
        "Custom input may only contain the letters a-j before '='; got: {}".format(unknown_letters)
    )

# Convert each letter in the sequence to its corresponding token
input_seq_tokens = [letter_to_token[letter] for letter in original_seq_str]

# Full model input for the copy task:
# [original sequence tokens] + [delimiter token] + [T blank tokens],
# where T is the length of the original sequence.
# NOTE(review): the model above is trained with a single fixed sequence length
# T, so a custom sequence of a different length may not be copied correctly.
T_custom = len(original_seq_str)
model_input_tokens = input_seq_tokens + [delimiter_token] + [blank_token] * T_custom

# Add a batch dimension: shape (1, sequence_length)
model_input = np.array(model_input_tokens).reshape(1, -1)

# Model output has shape (batch, time, vocab_size+1); argmax over the last axis
# gives the most likely token id at each time step.
predictions = model.predict(model_input)
predicted_tokens = np.argmax(predictions, axis=-1)[0]

# Only the last T time steps are expected to hold the copied sequence
predicted_seq_tokens = predicted_tokens[T_custom+1:]

# Convert predicted tokens back to letters. A predicted blank (0) has no letter;
# show it as '_' rather than dropping it so the output stays aligned with the
# input position by position.
predicted_seq_letters = ''.join(token_to_letter.get(int(token), '_') for token in predicted_seq_tokens)

print("Custom Input:", custom_input)
print("Predicted Output:", predicted_seq_letters)


1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 198ms/step
Custom Input: ababdceijhg=
Predicted Output: gggddddjjjj
