In [15]:
import json

def build_vocab(save_path="vocab.json"):
    """
    Build the token -> ID vocabulary for BoardGPT.

    The vocabulary contains, in this order:
    - Special tokens <pad>, <bos>, <eos> (IDs 0, 1, 2)
    - 60 positions (8x8 board minus 4 starting squares for Othello),
      enumerated column by column: a1..a8, b1..b8, ..., h1..h8

    Args:
        save_path: Path of the JSON file the vocabulary is written to.
            Defaults to "vocab.json". Pass None to build the vocabulary
            without touching the filesystem.

    Returns:
        dict mapping every token string to its integer ID (63 entries).
    """
    # The 4 centre squares are occupied before the first move, so they can
    # never be played and get no token.
    starting_squares = {"d4", "e5", "d5", "e4"}

    # Generate all playable positions (column letter outer, row number inner)
    positions = [
        f"{c}{r}"
        for c in "abcdefgh"
        for r in range(1, 9)
        if f"{c}{r}" not in starting_squares
    ]

    specials = ["<pad>", "<bos>", "<eos>"]

    # Map tokens to integer IDs; specials come first so <pad> is always 0
    vocab = {tok: i for i, tok in enumerate(specials + positions)}

    # Save to disk unless the caller opted out
    if save_path is not None:
        with open(save_path, "w", encoding="utf-8") as f:
            json.dump(vocab, f, indent=2)
        # end with
    # end if

    return vocab
# end def build_vocab

In [26]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from transformers import PreTrainedTokenizerFast
from tokenizers.pre_tokenizers import Whitespace


def build_tokenizer(vocab: dict, save_path="tokenizer.json"):
    """
    Build a HuggingFace-compatible tokenizer from a fixed vocab.

    A word-level `tokenizers.Tokenizer` is created from `vocab`, written to
    `save_path`, and then reloaded from that file as a
    `PreTrainedTokenizerFast` with <bos>/<eos>/<pad> registered as special
    tokens.

    Args:
        vocab: Mapping of token string -> integer ID. It must contain
            "<pad>", "<bos>" and "<eos>".
        save_path: Where the tokenizer JSON is written (and read back from).

    Returns:
        PreTrainedTokenizerFast backed by the saved tokenizer file.

    Note:
        The vocab has no dedicated unknown token, so "<pad>" doubles as the
        unknown token: any out-of-vocabulary word is encoded as the <pad> ID.
    """
    # Imported locally so this function does not depend on an edit to the
    # notebook's import cell.
    from tokenizers.pre_tokenizers import WhitespaceSplit

    tokenizer = Tokenizer(WordLevel(vocab=vocab, unk_token="<pad>"))
    # Split on whitespace ONLY. The `Whitespace` pre-tokenizer also breaks on
    # punctuation boundaries (regex \w+|[^\w\s]+), which would cut "<pad>"
    # into "<", "pad", ">" whenever the saved file is used without the
    # special tokens being re-registered.
    tokenizer.pre_tokenizer = WhitespaceSplit()
    tokenizer.save(save_path)

    fast_tok = PreTrainedTokenizerFast(
        tokenizer_file=save_path,
        bos_token="<bos>",
        eos_token="<eos>",
        pad_token="<pad>",
        unk_token="<pad>",
    )
    return fast_tok

In [27]:
# Build the token -> ID mapping (build_vocab also writes it to vocab.json).
vocab = build_vocab()

In [28]:
# Display the full mapping: 3 special tokens followed by the 60 board squares.
vocab

{'<pad>': 0,
 '<bos>': 1,
 '<eos>': 2,
 'a1': 3,
 'a2': 4,
 'a3': 5,
 'a4': 6,
 'a5': 7,
 'a6': 8,
 'a7': 9,
 'a8': 10,
 'b1': 11,
 'b2': 12,
 'b3': 13,
 'b4': 14,
 'b5': 15,
 'b6': 16,
 'b7': 17,
 'b8': 18,
 'c1': 19,
 'c2': 20,
 'c3': 21,
 'c4': 22,
 'c5': 23,
 'c6': 24,
 'c7': 25,
 'c8': 26,
 'd1': 27,
 'd2': 28,
 'd3': 29,
 'd6': 30,
 'd7': 31,
 'd8': 32,
 'e1': 33,
 'e2': 34,
 'e3': 35,
 'e6': 36,
 'e7': 37,
 'e8': 38,
 'f1': 39,
 'f2': 40,
 'f3': 41,
 'f4': 42,
 'f5': 43,
 'f6': 44,
 'f7': 45,
 'f8': 46,
 'g1': 47,
 'g2': 48,
 'g3': 49,
 'g4': 50,
 'g5': 51,
 'g6': 52,
 'g7': 53,
 'g8': 54,
 'h1': 55,
 'h2': 56,
 'h3': 57,
 'h4': 58,
 'h5': 59,
 'h6': 60,
 'h7': 61,
 'h8': 62}

In [29]:
# Wrap the vocabulary in a HuggingFace fast tokenizer (writes tokenizer.json).
tokenizer = build_tokenizer(vocab)

In [34]:
import torch

# Encode a sequence of moves.
# `text` is a space-separated string of vocabulary tokens; `tokenizer.encode`
# returns their integer IDs, which are then wrapped in an int64 tensor.
text = "<pad> <pad> a1 e2 h8"
ids = tokenizer.encode(text)
tensor = torch.tensor(ids, dtype=torch.long)

In [35]:
tensor

tensor([ 0,  0,  3, 34, 62])