## Imports

In [23]:
import tensorflow as tf
import os

## Dataset preparation

In [144]:
# Every field of a record is a scalar (shape []) with a default that is used
# when the field is missing from the serialized example.
_STRING_SCALAR = tf.io.FixedLenFeature([], tf.string, default_value='')
_INT64_SCALAR = tf.io.FixedLenFeature([], tf.int64, default_value=0)

# Parsing schema for one game record, keyed by feature name.
feature_description = dict(
    moves=_STRING_SCALAR,
    white_elo=_INT64_SCALAR,
    black_elo=_INT64_SCALAR,
    result=_STRING_SCALAR,
)


def parse_example(example_proto):
    """Decode serialized example protos according to ``feature_description``.

    Args:
        example_proto: string tensor holding the serialized record(s), as
            yielded by the ``TFRecordDataset`` this function is mapped over.

    Returns:
        A dict with the keys of ``feature_description`` ("moves",
        "white_elo", "black_elo", "result") mapped to the parsed tensors.
    """
    return tf.io.parse_example(example_proto, feature_description)
    
TRAINING_DATA_DIR = "data/training_data"
SHUFFLE_BUFFER_SIZE = 1024
SHUFFLE_SEED = 42  # fixed so the shuffle order is reproducible across runs

# os.listdir returns entries in arbitrary order; sorting keeps the order in
# which records reach the shuffle buffer stable between runs.
data_files = [f"{TRAINING_DATA_DIR}/{file}"
              for file in sorted(os.listdir(TRAINING_DATA_DIR))]
dataset = tf.data.TFRecordDataset(filenames=data_files,
                                  compression_type="ZLIB",
                                  num_parallel_reads=4)
# reshuffle_each_iteration=False pins ONE shuffle order for every pass over
# `dataset`. By default shuffle() draws a new order on each iteration, so a
# later `dataset.take(n)` / `dataset.skip(n)` split would be cut from two
# different orders and the two parts could share records.
dataset = dataset.map(parse_example).shuffle(
    SHUFFLE_BUFFER_SIZE,
    seed=SHUFFLE_SEED,
    reshuffle_each_iteration=False,
)

In [145]:
def get_size(dataset):
    """Return the number of elements in ``dataset``.

    The count is obtained by iterating over ``dataset`` once from start to
    finish, so the cost is a full pass over the data.

    Args:
        dataset: any iterable (here a ``tf.data.Dataset``).

    Returns:
        The element count as a Python ``int`` (0 for an empty iterable).
    """
    return sum(1 for _ in dataset)

AUTOTUNE = tf.data.AUTOTUNE
TRAIN_SPLIT = 0.8  # fraction of the records assigned to the training set

# Counting requires one full pass over the dataset.
dataset_size = get_size(dataset)
train_size = int(TRAIN_SPLIT * dataset_size)

# The first `train_size` elements form the training set and the remainder
# the validation set. Both pipelines iterate `dataset` independently, so the
# two parts are only disjoint if `dataset` yields the same order each time.
train_dataset = (
    dataset
    .take(train_size)
    .cache()
    .prefetch(buffer_size=AUTOTUNE)
)
val_dataset = (
    dataset
    .skip(train_size)
    .cache()
    .prefetch(buffer_size=AUTOTUNE)
)

## InputEmbedding Layer

In [175]:
class InputEmbedding(tf.keras.layers.Layer):
    """Turn whitespace-separated token strings into scaled embeddings.

    The layer chains a ``TextVectorization`` step (fixed vocabulary, no
    standardization, whitespace split) with a trainable ``Embedding`` and
    multiplies the result by ``sqrt(embedding_dim)``.
    """

    def __init__(self, vocabulary, embedding_dim):
        """Build the vectorizer and the embedding table.

        Args:
            vocabulary: list of token strings passed to ``TextVectorization``.
            embedding_dim: size of each embedding vector.
        """
        super().__init__()
        self.embedding_dim = embedding_dim
        self.vectorize_layer = tf.keras.layers.TextVectorization(
            output_mode="int",
            vocabulary=vocabulary,
            standardize=None,
            split="whitespace"
        )
        # Size the embedding table from this layer's OWN vectorizer. The
        # previous code read a bare global `vectorize_layer`, which fails on
        # a fresh kernel or picks up an unrelated layer's vocabulary size.
        vocab_size = self.vectorize_layer.vocabulary_size()
        self.embedding_layer = tf.keras.layers.Embedding(
            input_dim=vocab_size, output_dim=embedding_dim)

    def call(self, x):
        """Vectorize, embed and scale a batch of token strings.

        Args:
            x: string tensor; each entry is a whitespace-separated token
                sequence.

        Returns:
            Float tensor of embeddings, one ``embedding_dim``-sized vector
            per token, scaled by ``sqrt(embedding_dim)``.
        """
        x = self.vectorize_layer(x)
        x = self.embedding_layer(x)
        x *= tf.math.sqrt(tf.cast(self.embedding_dim, tf.float32))
        return x

In [179]:
pieces = ["p", "n", "b", "r", "q", "k"]
rank_names = ["1", "2", "3", "4", "5", "6", "7", "8"]
file_names = ["a", "b", "c", "d", "e", "f", "g", "h"]
squares = [f + r for r in rank_names for f in file_names]
promotions = ["-", "=n", "=b", "=r", "=q"]
vocabulary = pieces + squares + promotions

embedding = InputEmbedding(vocabulary, 2)
for example in train_dataset.batch(1).take(1):
    print(example["moves"])
    print(vectorize_layer(example["moves"]))
    print(embedding(example["moves"]))

tf.Tensor([b'p e2 e4 - p d7 d5 - p e4 d5 - n g8 f6 - n b1 c3 - p e7 e6 - b f1 c4 - b f8 b4 - q d1 f3 - n b8 d7 - p b2 b3 - n d7 c5 - b c1 b2 - b b4 c3 - b b2 c3 - k e8 g8 - p d5 e6 - b c8 e6 - b c4 e6 - r f8 e8 - n g1 e2 - r e8 e6 - k e1 g1 - q d8 e7 - n e2 f4 - r e6 c6 - b c3 f6 - q e7 f6 - r f1 e1 - n c5 d7 - r e1 e2 - q f6 a1 - r e2 e1 - q a1 e1 -'], shape=(1,), dtype=string)
tf.Tensor(
[[ 2 20 36 72  2 59 43 72  2 36 43 72  3 70 53 72  3  9 26 72  2 60 52 72
   4 13 34 72  4 69 33 72  6 11 29 72  3 65 59 72  2 17 25 72  3 59 42 72
   4 10 17 72  4 33 26 72  4 17 26 72  7 68 70 72  2 43 52 72  4 66 52 72
   4 34 52 72  5 69 68 72  3 14 20 72  5 68 52 72  7 12 14 72  6 67 60 72
   3 20 37 72  5 52 50 72  4 26 53 72  6 60 53 72  5 13 12 72  3 42 59 72
   5 12 20 72  6 53  8 72  5 20 12 72  6  8 12 72]], shape=(1, 136), dtype=int64)
tf.Tensor(
[[[-0.01530797 -0.0104291 ]
  [-0.01870643  0.01906184]
  [ 0.05186082  0.05012989]
  [ 0.06932258  0.00086015]
  [-0.01530797 -0.0104291 ]
  [-

2023-11-05 17:16:55.645227: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
