## Imports

In [1]:
import tensorflow as tf
import os

## Dataset preparation

In [13]:
# (feature key, dtype, default) for every field stored in a serialized record.
_FEATURE_SPECS = (
    ("moves", tf.string, ''),
    ("white_elo", tf.int64, 0),
    ("black_elo", tf.int64, 0),
    ("result", tf.string, ''),
)

# Parsing schema: each feature is a scalar (shape []) with a default value
# that is substituted when the key is missing from a record.
feature_description = {
    key: tf.io.FixedLenFeature([], dtype, default_value=default)
    for key, dtype, default in _FEATURE_SPECS
}

def parse_example(example_proto):
    """Decode serialized `tf.train.Example` data into a dict of tensors.

    Args:
        example_proto: Serialized example(s) as a string tensor; passed
            unchanged to `tf.io.parse_example`.

    Returns:
        A dict keyed by the feature names in `feature_description`, holding
        the parsed tensor for each feature.
    """
    return tf.io.parse_example(example_proto, feature_description)
    
TRAINING_DATA_DIR = "data/training_data"
SHUFFLE_BUFFER_SIZE = 1024
SHUFFLE_SEED = 42  # fixed so the shuffled order is reproducible across kernel restarts

# os.listdir returns entries in arbitrary order; sort them so the record
# stream (and therefore the shuffle result) does not depend on the filesystem.
data_files = sorted(
    f"{TRAINING_DATA_DIR}/{file}" for file in os.listdir(TRAINING_DATA_DIR)
)
dataset = tf.data.TFRecordDataset(filenames=data_files,
                                  compression_type="GZIP",
                                  num_parallel_reads=4)
# reshuffle_each_iteration=False pins one order for every pass over `dataset`.
# With the default (True) each iteration yields a different permutation, so
# position-based splits of this dataset (take/skip) would be drawn from
# different orderings and could contain the same records.
# num_parallel_calls only parallelises parsing; element order stays
# deterministic by default.
dataset = (
    dataset
    .map(parse_example, num_parallel_calls=tf.data.AUTOTUNE)
    .shuffle(SHUFFLE_BUFFER_SIZE,
             seed=SHUFFLE_SEED,
             reshuffle_each_iteration=False)
)

In [None]:
def get_size(dataset):
    """Return the number of elements in `dataset`.

    The count is obtained by iterating over `dataset` once from start to
    finish, so the cost is a complete pass over the data.

    Args:
        dataset: Any iterable.

    Returns:
        The element count as a Python int (0 for an empty iterable).
    """
    return sum(1 for _ in dataset)

TRAIN_SPLIT = 0.8
AUTOTUNE = tf.data.AUTOTUNE


def _cache_and_prefetch(split):
    """Cache a split's elements and prefetch them with an autotuned buffer."""
    return split.cache().prefetch(buffer_size=AUTOTUNE)


# Counting the elements takes one complete pass over the dataset.
dataset_size = get_size(dataset)
# Truncate towards zero; the remainder goes to the validation split.
train_size = int(dataset_size * TRAIN_SPLIT)
print(f"Data set size: {dataset_size}")
print(f"Train split size: {train_size}")
print(f"Validation split size: {dataset_size - train_size}")

# NOTE(review): take/skip split by position, so the two splits are disjoint
# only if `dataset` yields the same order on every iteration -- confirm the
# upstream shuffle does not reshuffle per iteration.
train_dataset = _cache_and_prefetch(dataset.take(train_size))
val_dataset = _cache_and_prefetch(dataset.skip(train_size))

## InputEmbedding Layer

In [17]:
class InputEmbedding(tf.keras.layers.Layer):
    """Tokenizes whitespace-separated strings and embeds the tokens.

    Input strings are split on whitespace without any standardization,
    mapped to integer ids through the supplied vocabulary, looked up in an
    embedding table and scaled by sqrt(embedding_dim).
    """

    def __init__(self, vocabulary, embedding_dim, **kwargs):
        """Build the vectorization and embedding sub-layers.

        Args:
            vocabulary: Vocabulary forwarded to `TextVectorization`.
            embedding_dim: Size of each embedding vector.
            **kwargs: Standard `tf.keras.layers.Layer` arguments
                (e.g. `name`, `dtype`, `trainable`).
        """
        # Forward base-layer options instead of silently rejecting them.
        super().__init__(**kwargs)
        self.embedding_dim = embedding_dim
        self.vectorize_layer = tf.keras.layers.TextVectorization(
            output_mode="int",
            vocabulary=vocabulary,
            standardize=None,
            split="whitespace"
        )
        # vocabulary_size() also counts the tokens TextVectorization adds on
        # its own (padding / out-of-vocabulary), so the table covers every id
        # the vectorizer can emit.
        vocab_size = self.vectorize_layer.vocabulary_size()
        self.embedding_layer = tf.keras.layers.Embedding(
            input_dim=vocab_size, output_dim=embedding_dim)

    def call(self, x):
        """Map a string tensor to scaled token embeddings.

        Args:
            x: String tensor of whitespace-separated tokens.

        Returns:
            The embedded tokens multiplied by sqrt(embedding_dim), with one
            trailing axis of size `embedding_dim` added by the embedding.
        """
        x = self.vectorize_layer(x)
        x = self.embedding_layer(x)
        # Compute the scale in the embedding's own dtype: a hard-coded float32
        # scale fails with a dtype mismatch when the layer runs under a
        # mixed-precision policy (float16/bfloat16 outputs).
        scale = tf.math.sqrt(tf.cast(self.embedding_dim, x.dtype))
        return x * scale

In [19]:
# Token inventory for the move strings: piece letters, board squares and
# promotion tokens ("-" presumably meaning "no promotion" -- TODO confirm).
pieces = list("pnbrqk")
rank_names = [str(rank) for rank in range(1, 9)]
file_names = list("abcdefgh")

# Squares are enumerated rank by rank: a1, b1, ..., h1, a2, ..., h8.
squares = [
    f"{file_name}{rank}"
    for rank in rank_names
    for file_name in file_names
]

promotions = ["-"] + ["=" + piece for piece in "nbrq"]
vocabulary = [*pieces, *squares, *promotions]

# Smoke test: embed the moves of a single game with a tiny 2-dimensional
# embedding and print the resulting tensor.
embedding = InputEmbedding(vocabulary=vocabulary, embedding_dim=2)
single_game_batches = dataset.batch(1).take(1)
for example in single_game_batches:
    embedded_moves = embedding(example["moves"])
    print(embedded_moves)

tf.Tensor(
[[[ 0.02544026  0.04513648]
  [-0.06696517  0.06143199]
  [ 0.0074494  -0.03557823]
  [ 0.05222796  0.05570115]
  [ 0.02544026  0.04513648]
  [-0.05268295  0.0589317 ]
  [ 0.03350848  0.01162371]
  [ 0.05222796  0.05570115]
  [ 0.02544026  0.04513648]
  [-0.05334675 -0.00995212]
  [ 0.02112434 -0.03747362]
  [ 0.05222796  0.05570115]
  [ 0.02544026  0.04513648]
  [ 0.03350848  0.01162371]
  [ 0.02112434 -0.03747362]
  [ 0.05222796  0.05570115]
  [ 0.02544026  0.04513648]
  [ 0.02345442 -0.03764417]
  [ 0.02825521  0.04250508]
  [ 0.05222796  0.05570115]
  [ 0.02544026  0.04513648]
  [ 0.02112434 -0.03747362]
  [ 0.02825521  0.04250508]
  [ 0.05222796  0.05570115]
  [-0.03626743  0.02851148]
  [-0.02902353  0.01916149]
  [-0.0570095  -0.01166778]
  [ 0.05222796  0.05570115]
  [ 0.02544026  0.04513648]
  [ 0.02825521  0.04250508]
  [ 0.02152756 -0.06275958]
  [ 0.05222796  0.05570115]
  [-0.03626743  0.02851148]
  [ 0.06702362  0.01268755]
  [ 0.02152756 -0.06275958]
  [ 0.052