In [None]:
import tensorflow as tf

import numpy as np
import os
import time

In [None]:
path_to_file = "dataset_loda_programs.csv"

In [None]:
# Read the corpus as raw bytes and decode it as UTF-8. The context manager
# closes the file handle as soon as the read finishes instead of leaving it
# open until the object happens to be garbage collected.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')
# length of text is the number of characters in it
print(f'Length of text: {len(text)} characters')


Length of text: 11575297 characters


In [None]:
# Take a look at the first 250 characters in text
print(text[:250])


mov $1,1\nlpb $0\nsub $0,1\nmov $2,$3\nmul $2,4\nmul $3,6\nadd $3,$1\nmov $1,$2\nlpe\nmov $0,$3
mov $1,$0\nmin $1,1\nseq $0,66645\nadd $0,$1
mov $3,$0\npow $0,0\nsub $0,1\nlpb $3\nmov $2,$0\nmul $2,$3\nadd $2,1\nseq $2,22087\nsub $3,1\nadd $0,1\nadd 


In [None]:
# The unique characters in the file, sorted into a deterministic order.
# This list is later passed to StringLookup as its vocabulary.
vocab = sorted(set(text))
print(f'{len(vocab)} unique characters')


38 unique characters


In [None]:
example_texts = ['mov $0,4', 'add']

chars = tf.strings.unicode_split(example_texts, input_encoding='UTF-8')
chars

<tf.RaggedTensor [[b'm', b'o', b'v', b' ', b'$', b'0', b',', b'4'], [b'a', b'd', b'd']]>

In [None]:
# Lookup layer that maps each character to an integer id.
# mask_token=None means no mask token is configured; in the outputs shown in
# this notebook id 0 decodes to "[UNK]".
ids_from_chars = tf.keras.layers.StringLookup(
    vocabulary=list(vocab), mask_token=None)


In [None]:
ids = ids_from_chars(chars)
ids

<tf.RaggedTensor [[27, 29, 36, 2, 4, 7, 5, 11], [18, 21, 21]]>

In [None]:
# Inverse lookup (ids back to characters). It is built from the forward
# layer's get_vocabulary() rather than from `vocab` directly, so it uses the
# exact vocabulary list the forward layer reports.
chars_from_ids = tf.keras.layers.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None)


In [None]:
chars = chars_from_ids(ids)
chars

<tf.RaggedTensor [[b'm', b'o', b'v', b' ', b'$', b'0', b',', b'4'], [b'a', b'd', b'd']]>

In [None]:
tf.strings.reduce_join(chars, axis=-1).numpy()


array([b'mov $0,4', b'add'], dtype=object)

In [None]:
def text_from_ids(ids):
  """Turn token ids back into text by joining their characters on the last axis."""
  characters = chars_from_ids(ids)
  joined = tf.strings.reduce_join(characters, axis=-1)
  return joined


In [None]:
all_ids = ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))
all_ids

<tf.Tensor: shape=(11575297,), dtype=int64, numpy=array([27, 29, 36, ...,  5, 16,  1])>

In [None]:
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)


In [None]:
# Peek at the first ten ids of the stream, decoded back to characters.
# The loop variable is named `char_id` rather than `ids` so this cell does not
# overwrite the ragged `ids` tensor that earlier cells define and reuse.
for char_id in ids_dataset.take(10):
    print(chars_from_ids(char_id).numpy().decode('utf-8'))


m
o
v
 
$
1
,
1
\
n


In [None]:
seq_length = 100


In [None]:
# Group the id stream into chunks of seq_length+1 ids; each chunk is later
# split into an input and a target that are both seq_length long.
# drop_remainder=True discards a final chunk that is shorter than that.
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)

# Show the first chunk as individual characters.
for seq in sequences.take(1):
  print(chars_from_ids(seq))


tf.Tensor(
[b'm' b'o' b'v' b' ' b'$' b'1' b',' b'1' b'\\' b'n' b'l' b'p' b'b' b' '
 b'$' b'0' b'\\' b'n' b's' b'u' b'b' b' ' b'$' b'0' b',' b'1' b'\\' b'n'
 b'm' b'o' b'v' b' ' b'$' b'2' b',' b'$' b'3' b'\\' b'n' b'm' b'u' b'l'
 b' ' b'$' b'2' b',' b'4' b'\\' b'n' b'm' b'u' b'l' b' ' b'$' b'3' b','
 b'6' b'\\' b'n' b'a' b'd' b'd' b' ' b'$' b'3' b',' b'$' b'1' b'\\' b'n'
 b'm' b'o' b'v' b' ' b'$' b'1' b',' b'$' b'2' b'\\' b'n' b'l' b'p' b'e'
 b'\\' b'n' b'm' b'o' b'v' b' ' b'$' b'0' b',' b'$' b'3' b'\n' b'm' b'o'
 b'v' b' ' b'$'], shape=(101,), dtype=string)


In [None]:
# Join the characters of the first five chunks back into byte strings, which
# makes it easy to see that each chunk continues where the previous one ended.
for seq in sequences.take(5):
  print(text_from_ids(seq).numpy())


b'mov $1,1\\nlpb $0\\nsub $0,1\\nmov $2,$3\\nmul $2,4\\nmul $3,6\\nadd $3,$1\\nmov $1,$2\\nlpe\\nmov $0,$3\nmov $'
b'1,$0\\nmin $1,1\\nseq $0,66645\\nadd $0,$1\nmov $3,$0\\npow $0,0\\nsub $0,1\\nlpb $3\\nmov $2,$0\\nmul $2,$3\\n'
b'add $2,1\\nseq $2,22087\\nsub $3,1\\nadd $0,1\\nadd $1,$2\\nlpe\\nmov $0,$1\\ndiv $0,4\\nadd $0,1\nmul $0,6\\na'
b'dd $0,5\\ndiv $0,4\\npow $0,2\nlpb $0\\nadd $2,1\\nsub $0,$2\\nlpe\\nsub $0,1\\nadd $2,1\\nsub $2,$0\\nmax $3,$'
b'0\\nmov $4,$0\\nmov $0,$2\\nlpb $0\\nsub $0,2\\nadd $3,$0\\nbin $3,$0\\nadd $1,$3\\nmov $3,$4\\nlpe\\nmov $0,$1'


In [None]:
def split_input_target(sequence):
    """Split a sequence into an (input, target) pair offset by one position.

    The input is every element except the last and the target is every
    element except the first, so target[i] is the element that follows
    input[i] in the original sequence.
    """
    return sequence[:-1], sequence[1:]

In [None]:
split_input_target(list("Tensorflow"))


(['T', 'e', 'n', 's', 'o', 'r', 'f', 'l', 'o'],
 ['e', 'n', 's', 'o', 'r', 'f', 'l', 'o', 'w'])

In [None]:
dataset = sequences.map(split_input_target)


In [None]:
# Print one (input, target) pair as text; the target is the same chunk
# shifted by one character, so it starts at the input's second character.
for input_example, target_example in dataset.take(1):
    print("Input :", text_from_ids(input_example).numpy())
    print("Target:", text_from_ids(target_example).numpy())


Input : b'mov $1,1\\nlpb $0\\nsub $0,1\\nmov $2,$3\\nmul $2,4\\nmul $3,6\\nadd $3,$1\\nmov $1,$2\\nlpe\\nmov $0,$3\nmov '
Target: b'ov $1,1\\nlpb $0\\nsub $0,1\\nmov $2,$3\\nmul $2,4\\nmul $3,6\\nadd $3,$1\\nmov $1,$2\\nlpe\\nmov $0,$3\nmov $'


In [None]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# Build the training pipeline from `sequences` rather than re-assigning
# `dataset` from itself, so re-running this cell does not batch data that is
# already batched. tf.data.AUTOTUNE replaces the deprecated
# tf.data.experimental.AUTOTUNE alias.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.AUTOTUNE))

dataset

<PrefetchDataset element_spec=(TensorSpec(shape=(64, 100), dtype=tf.int64, name=None), TensorSpec(shape=(64, 100), dtype=tf.int64, name=None))>

In [None]:
# Length of the vocabulary in StringLookup Layer
# (39 in the run captured in this notebook: the 38 unique characters plus
# the "[UNK]" entry).
vocab_size = len(ids_from_chars.get_vocabulary())

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024


In [None]:
class MyModel(tf.keras.Model):
  """Character-level language model: Embedding -> GRU -> Dense logits.

  Args:
    vocab_size: Number of distinct token ids; used as the embedding input
      size and as the number of output logits.
    embedding_dim: Size of each embedding vector.
    rnn_units: Number of units in the GRU layer.
  """

  def __init__(self, vocab_size, embedding_dim, rnn_units):
    # super() already binds the instance, so the base initializer is called
    # with no arguments; passing `self` again would hand the model to Keras
    # as a stray positional argument.
    super().__init__()
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    # return_sequences gives an output for every time step; return_state
    # additionally returns the final state so generation can carry it over.
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    """Compute next-token logits for a batch of token-id sequences.

    Args:
      inputs: Batch of token-id sequences fed to the embedding layer.
      states: Optional initial GRU state. When None, the state returned by
        the GRU's get_initial_state is used.
      return_state: When True, also return the final GRU state.
      training: Forwarded to every layer call.

    Returns:
      The Dense layer output (logits over the vocabulary for each position),
      or a (logits, states) tuple when return_state is True.
    """
    x = inputs
    x = self.embedding(x, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    else:
      return x

In [None]:
model = MyModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)


In [None]:
# Sanity check before training: run the model on one batch and print the
# shape of the logits it returns. The loop variables are reused by later
# cells, so they stay bound to this single batch.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")


(64, 100, 39) # (batch_size, sequence_length, vocab_size)


In [None]:
model.summary()


Model: "my_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  9984      
                                                                 
 gru (GRU)                   multiple                  3938304   
                                                                 
 dense (Dense)               multiple                  39975     
                                                                 
Total params: 3,988,263
Trainable params: 3,988,263
Non-trainable params: 0
_________________________________________________________________


In [None]:
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()


In [None]:
sampled_indices

array([37,  0,  0,  4, 17, 32, 24,  0, 12, 34,  9, 10,  9, 20,  0,  6, 27,
       31, 23, 10, 19,  4, 26, 17,  1, 18, 31, 24, 15, 33, 31,  6, 37,  6,
       14,  4,  7, 30, 24,  2, 10, 11,  2, 31, 17, 37, 16,  6, 22, 33, 27,
       38, 30, 27,  3, 21,  7, 27,  6, 32, 24, 38, 38, 12, 19, 18, 30, 36,
       26, 22, 12,  1, 25,  9,  9,  8, 23, 11,  0, 35, 15,  2, 26, 19, 11,
       17, 31, 16, 32, 10,  8, 16, 12, 15, 28, 23, 11, 37, 34, 31])

In [None]:
# Decode the first input of the example batch and the ids sampled from the
# model's logits for it, to compare them as text.
print("Input:\n", text_from_ids(input_example_batch[0]).numpy())
print()
print("Next Char Predictions:\n", text_from_ids(sampled_indices).numpy())


Input:
 b'ax $4,0\\ncmp $4,$0\\nmul $2,$4\\ntrn $2,1\\nlpe\\nmov $0,$5\\nadd $0,1\nadd $0,1\\nlpb $0\\nsub $0,1\\nmov $2'

Next Char Predictions:
 b'w[UNK][UNK]$\\rg[UNK]5t232c[UNK]-mqf3b$l\\\naqg8sq-w-7$0pg 34 q\\w9-esmxpm"d0m-rgxx5bapvle5\ni221f4[UNK]u8 lb4\\q9r31958nf4wtq'


In [None]:
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)


In [None]:
# Evaluate the loss on the example batch. The model outputs are passed in
# directly because the loss object was created with from_logits=True.
example_batch_mean_loss = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("Mean loss:        ", example_batch_mean_loss)


Prediction shape:  (64, 100, 39)  # (batch_size, sequence_length, vocab_size)
Mean loss:         tf.Tensor(3.663986, shape=(), dtype=float32)


In [None]:
tf.exp(example_batch_mean_loss).numpy()


39.016552

In [None]:
model.compile(optimizer='adam', loss=loss)


In [None]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
# ("{epoch}" is a placeholder in the file name pattern, giving one
# checkpoint name per epoch).
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# save_weights_only=True stores only the model weights in each checkpoint.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)


In [None]:
EPOCHS = 20


In [None]:
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
class OneStep(tf.keras.Model):
  """Single-step text generator wrapped around a trained character model.

  Args:
    model: Model called as model(inputs=ids, states=states,
      return_state=True) and returning a (logits, states) pair.
    chars_from_ids: Lookup layer mapping token ids to characters.
    ids_from_chars: Lookup layer mapping characters to token ids.
    temperature: Positive divisor applied to the logits before sampling.

  Raises:
    ValueError: If temperature is not greater than zero.
  """

  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    # The logits are divided by the temperature before sampling, so zero
    # would yield inf/NaN logits and a negative value would flip their order.
    # Reject those values up front instead of sampling garbage later.
    if not temperature > 0:
      raise ValueError(f'temperature must be positive, got {temperature!r}')
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Create a mask to prevent "[UNK]" from being generated.
    skip_ids = self.ids_from_chars(['[UNK]'])[:, None]
    sparse_mask = tf.SparseTensor(
        # Put a -inf at each bad index.
        values=[-float('inf')]*len(skip_ids),
        indices=skip_ids,
        # Match the shape to the vocabulary
        dense_shape=[len(ids_from_chars.get_vocabulary())])
    # Dense vector that is -inf at the "[UNK]" id and 0 everywhere else; it is
    # added to the logits in generate_one_step.
    self.prediction_mask = tf.sparse.to_dense(sparse_mask)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    """Sample the next character for each input string.

    Args:
      inputs: Batch of UTF-8 strings to condition on.
      states: State returned by a previous call, or None for the first call.

    Returns:
      A (predicted_chars, states) pair: one sampled character per input
      string and the model state to pass to the next call.
    """
    # Convert strings to token IDs.
    input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
    # NOTE(review): to_tensor() pads shorter rows (presumably with id 0) when
    # the input strings differ in length, so for those rows the last-position
    # logits used below are computed after padding ids -- confirm whether
    # this matters for batched seeds of unequal length.
    input_ids = self.ids_from_chars(input_chars).to_tensor()

    # Run the model.
    # predicted_logits.shape is [batch, char, next_char_logits]
    predicted_logits, states = self.model(inputs=input_ids, states=states,
                                          return_state=True)
    # Only use the last prediction.
    predicted_logits = predicted_logits[:, -1, :]
    predicted_logits = predicted_logits/self.temperature
    # Apply the prediction mask: prevent "[UNK]" from being generated.
    predicted_logits = predicted_logits + self.prediction_mask

    # Sample the output logits to generate token IDs.
    predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
    predicted_ids = tf.squeeze(predicted_ids, axis=-1)

    # Convert from token ids to characters
    predicted_chars = self.chars_from_ids(predicted_ids)

    # Return the characters and model state.
    return predicted_chars, states


In [None]:
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)


In [None]:
# Generate 1000 characters from a single seed string, feeding each sampled
# character and the returned state back into the next step.
# time.perf_counter() is a monotonic clock meant for measuring elapsed
# intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
states = None
next_char = tf.constant(['mov $'])
result = [next_char]

for n in range(1000):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

# Concatenate the seed and every sampled character into one string per row.
result = tf.strings.join(result)
end = time.perf_counter()
print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print('\nRun time:', end - start)


mov $5,6\nmov $6,$3\nsub $6,$7\nadd $5,$1\nmov $6,$7\nadd $5,1\nadd $1,$6\nmov $2,$3\nmov $3,$2\nmov $2,$1\nseq $3,1262209\nadd $3,1\nadd $0,$3\ndiv $0,8\nadd $0,1
pow $0,2\nmul $0,3\nsub $0,1
add $0,1\nmov $1,10\npow $1,$0\nmov $0,$1\nadd $0,2
mov $5,14\nmov $2,$0\nadd $2,2\nmul $2,24\nsub $0,1\nadd $1,$4\nmul $1,$0\nadd $3,$5\nmov $7,$6\nadd $7,$9\nadd $9,$10\nadd $9,$5\nmov $7,2\nlpb $0\nadd $5,1\nmul $7,$$9\nsub $3,$4\nmov $4,$0\nmax $4,0\ncmp $4,$0\nmul $2,$4\ntrn $2,1\nlpe\nmov $0,$5
mov $1,2\npow $1,$0\nadd $1,1\nseq $0,40\nmul $0,$1
add $0,2\npow $0,6\nsub $0,18\nmul $0,6\nadd $0,115
mov $1,1\nadd $0,1\nlpb $0\nadd $1,1\nsub $0,$1\nmov $2,$0\nlpe\nmov $0,$1\nadd $0,1
seq $0,158446\nsub $0,1\nseq $0,5
seq $0,159557\nmod $0,9
seq $0,32739\nlpb $0\nmul $0,2\ndif $0,0\nlpe\ndiv $0,5
mov $1,1\nmov $2,2\nadd $0,1\nlpb $0\nsub $0,1\nmov $3,$2\nmul $3,4\nsub $1,$3\nadd $3,$1\nmov $4,$1\nadd $4,$2\nmov $4,$3\nmov $3,$1\nlpe\nmov $0,$2
lpb $0\nsub $0,1\nadd $4,$2\nadd $1,$3\nadd $2,$4\nm

In [None]:
# Generate 1000 characters for a batch of three seed strings in parallel.
# time.perf_counter() is a monotonic clock meant for measuring elapsed
# intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
states = None
next_char = tf.constant(['mov $1,$0', 'add $0,', 'mov $1,$0'])
result = [next_char]

for n in range(1000):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

# Concatenate each seed with its sampled characters, one string per row.
result = tf.strings.join(result)
end = time.perf_counter()
print(result, '\n\n' + '_'*80)
print('\nRun time:', end - start)


tf.Tensor(
[b'mov $1,$0\\nmov $4,1\\nsub $0,1\\nmov $2,$0\\nlpb $2\\nmov $3,$2\\ngcd $3,$0\\ncmp $3,$2\\ncmp $3,0\\nmul $3,$0\\npow $3,2\\nmov $5,$3\\nmod $5,16\\ncmp $5,0\\ndiv $5,321\\nlpb $5\\ndiv $5,10\\nmov $3,8\\nbin $3,$1\\nmul $3,$2\\nadd $1,1\\nadd $5,$3\\nlpe\\nmov $0,$5\nmov $1,6\\npow $1,$0\\nmul $0,$1\\nadd $0,2\nmov $1,$0\\nseq $1,278573\\nseq $0,138220\\nmul $1,$0\\nmov $0,$1\\ndiv $0,4\nmov $2,$0\\nadd $2,2\\npow $2,3\\nlpb $2\\nsub $2,1\\nadd $0,$1\\nadd $0,1\\nsub $2,$0\\nbin $2,$0\\nmov $3,$4\\nbin $3,$1\\nadd $1,1\\nmul $3,$2\\ndiv $3,$1\\nadd $5,1\\nadd $5,$3\\nlpe\\nmov $0,$5\\nsub $0,1\nmov $1,1\\nmov $4,1\\nmov $5,$0\\nadd $5,1\\nlpb $5\\nsub $5,1\\nmov $0,$3\\nseq $0,5\\nsub $0,1\\nmov $1,$2\\nlpe\\nmov $0,$8\\nadd $0,1\nmov $5,$0\\nadd $5,1\\nlpb $5\\nsub $5,1\\nmov $0,$3\\nseq $0,56649\\nsub $0,$4\\nadd $1,$4\\nmov $4,$0\\nlpe\\nmov $0,$4\\nadd $0,1\nmov $1,1\\nmov $2,$0\\nadd $2,6\\npow $2,3\\nlpb $2\\nmov $3,$6\\nseq $3,10061\\nsub $3,1\\ncmp $3,5\\nsub $0,

In [None]:
# Export the one-step generator to the 'one_step' directory as a SavedModel
# and load it straight back, so the next cell can generate text from the
# reloaded object.
tf.saved_model.save(one_step_model, 'one_step')
one_step_reloaded = tf.saved_model.load('one_step')




In [None]:
# Generate 100 characters with the reloaded model, starting from one seed
# string and carrying the returned state from step to step.
states = None
next_char = tf.constant(['lpb $0'])
result = [next_char]

for n in range(100):
  next_char, states = one_step_reloaded.generate_one_step(next_char, states=states)
  result.append(next_char)

# Join the seed and the sampled characters and print the first (only) row.
print(tf.strings.join(result)[0].numpy().decode("utf-8"))


lpb $0\nadd $1,2\nsub $0,$1\nlpe\nsub $1,$0\nmul $1,2\nmov $4,$3\ndiv $4,2\nlpe\nsub $0,1\nlpe\nmov $0,$2

