In [1]:
import chess.pgn
import io
from typing import List
from chess.pgn import Game
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch

import sys
sys.path.insert(0, '../src')  # Adjust as necessary based on your directory structure

from load import load_pgn_chunk
from data import ChessDataset
from cnn import ChessCNN

In [2]:
# Define the PGN file path and how many games make up one chunk
pgn_file_path = "../dataset/lichess_db_standard_rated_2016-05.pgn"
chunk_size = 10_000  # number of games requested per chunk

# Load the first chunk (chunk_number=1) via the project helper `load_pgn_chunk`
first_chunk = load_pgn_chunk(pgn_file_path, chunk_size=chunk_size, chunk_number=1)
print(f"Loaded {len(first_chunk)} games in the first chunk")

Loaded 10000 games in the first chunk


In [4]:
# One list of UCI move strings per game in the chunk.
all_moves = [
    [move.uci() for move in game.mainline_moves()]
    for game in first_chunk
]

# Shallow memory estimate: the outer list object plus each per-game list
# object. sys.getsizeof does not follow references, so the move strings
# themselves are not included in this figure.
size_in_bytes = sys.getsizeof(all_moves) + sum(map(sys.getsizeof, all_moves))

In [9]:
# Report the size estimate held in `size_in_bytes`, in bytes and in
# megabytes (1 MB taken as 1024 * 1024 bytes).
print(f"Total size in bytes: {size_in_bytes}")
print(f"Total size in megabytes: {size_in_bytes / (1024 * 1024)}")

Total size in bytes: 6493016
Total size in megabytes: 6.192222595214844


In [7]:
#for game in first_chunk:
#    print(game)

In [None]:
# Shallow memory estimate for the raw chunk: the list object itself plus
# sys.getsizeof of each element. `sys` is already imported in the first cell.
# sys.getsizeof does not follow references, so whatever each game object
# points to is not counted; treat the result as a lower bound.
#
# A dedicated variable is used so that the `size_in_bytes` value computed for
# `all_moves` in an earlier cell is not silently overwritten.
first_chunk_size_in_bytes = sys.getsizeof(first_chunk)  # Size of the list structure itself
first_chunk_size_in_bytes += sum(sys.getsizeof(item) for item in first_chunk)  # Adding the shallow size of each item

print(f"Total size in bytes: {first_chunk_size_in_bytes}")
print(f"Total size in megabytes: {first_chunk_size_in_bytes / (1024 * 1024)}")


In [None]:
# Preview only the first few games; displaying the whole chunk would dump
# the repr of every loaded game into the notebook output.
first_chunk[:5]

In building the supervised learning part of the model, we'll focus on training a network to predict the next move in a given chess position.

We will therefore process the PGN dataset to extract board positions and corresponding moves. This will involve converting chessboard positions into a suitable numerical format for the CNN.

We will design a CNN that takes the board position as input. The output layer should predict the probability of each possible move. Since there are a limited number of legal moves in any given position, this becomes a multi-class classification problem.

### The Input Layer

The input layer should reflect the board layout (8x8 squares with channels representing different pieces).

### The Output Layer

The output should represent all possible moves. In chess, a common approach is to have a fixed-size array where each entry corresponds to a potential move.

Our first mission is to construct the input and output tensor:
X --> Y

Where X, the input is the board layout at every single game state and Y is the corresponding next move (output).

This is an example of the data for a given game:

```Moves: e2e4 d7d5 e4d5 d8d5 b1c3 d5d8 d2d4 g8f6 g1f3 c8g4 h2h3 g4f3 g2f3 c7c6 f1g2 b8d7 c1e3 e7e6 d1d2 f6d5 c3d5 c6d5 e1c1 f8e7 c2c3 d8c7 c1b1 e8c8 f3f4 c8b8 h1g1 b8a8 g2h1 g7g6 h3h4 e7h4 f2f3 h4e7 d2c2 d7f6 h1g2 f6h5 g2h3 h5f4 e3f4 c7f4 d1f1 f4d6 g1g4 d8f8 f1g1 f7f5 g4g2 e7f6 g2g3 f8g8 h3f1 g8g7 f1d3 h8g8 c2h2 d6b8 h2g2 b8c8 f3f4 c8c6 g2f2 f6h4 g3g6 h4f2 g6g7 g8g7 g1g7 a7a6 g7g8 a8a7 g8h8 c6d7 h8h7 d7h7```

To construct the input we will have to recreate the board layout for every single game state.

### X Tensor (Board State Representation)
The X tensor represents the state of the chess board.

X will be a 3-dimensional tensor: two dimensions represent the board layout (8x8) and the third dimension represents the type of chess piece, distinguished by color (6 piece types × 2 colors = 12 channels). The tensor can be represented as follows:

$X \in \mathbb{R}^{8 \times 8 \times 12}$

Each element $X_{i,j,k}$ of this tensor can be defined as:

$$
X_{i,j,k} =
   \begin{cases}
   1 & \text{if piece type } k \text{ is present at position } (i, j) \\
   0 & \text{otherwise}
   \end{cases}
$$

So in essence we're dealing with a 3-dimensional tensor with binary states.

### Y Tensor (Next Move Representation)

There are 64 squares on a chessboard, so there are 64 possible starting points and 64 possible ending points for each move, leading to 64×64=4096 possible moves (including illegal ones, which the model should learn to never predict).

The Y tensor represents the next move for each given game state. We encode each move as a one-hot vector of length 64x64 (representing all possible source and destination squares), the tensor can be represented as:

$Y \in \{0, 1\}^{4096}$

Each element $Y_{l}$ of this tensor, where $l$ corresponds to a combination of source and destination squares, can be defined as:

$$
Y_{l} =
   \begin{cases}
   1 & \text{if the move corresponds to the index } l \\
   0 & \text{otherwise}
   \end{cases}
$$

In this representation, the index $l$ is calculated based on the source square and the destination square of the move. For instance, if you flatten the 8x8 board into a 64-element array, then a move from square $a$ to square $b$ would correspond to an index $l = 64 \times a + b$.

### Summary
- **X Tensor**: Represents the board state with a 3D tensor where the dimensions are board height, board width, and number of piece types.
- **Y Tensor**: Represents the move as a one-hot encoded vector in a flattened 2D space of source and destination squares.

In [None]:
# Preprocessing
game_moves = []

for game in first_chunk:
    moves = [move.uci() for move in game.mainline_moves()]

    game_moves.append(moves)

In [None]:
# Build the project's ChessDataset from the per-game UCI move lists.
# Later cells read `dataset.positions` and `dataset.moves` from it.
dataset = ChessDataset(game_moves)

In [None]:
# Checksum
#for move in dataset.moves:
#    assert(sum(move) == 1.0)

In [None]:
#dataset.check_layout(0)

In [None]:
class ChessCNN(nn.Module):
    """Convolutional network mapping a board encoding to a move distribution.

    The network expects a batch of boards in channels-first layout,
    ``(N, 12, 8, 8)``, and returns a ``(N, 4096)`` tensor whose rows are
    softmax probabilities over the 64 x 64 (source, destination) move indices.

    NOTE(review): this definition shadows the ``ChessCNN`` imported from
    ``cnn`` in the first cell; keep only one of the two.
    """

    def __init__(self):
        super().__init__()

        # We do not add padding. This approach respects the inherent structure of the
        # chessboard and avoids introducing artificial concepts that are not present in
        # the actual game. It allows the model to focus on learning meaningful spatial
        # relationships and patterns that are genuinely indicative of chess strategies.
        #
        # Since we do not add padding, a 2x2 kernel shrinks the spatial dimensions
        # from the original 8x8 layout to 7x7.
        self.conv1 = nn.Conv2d(12, 64, kernel_size=2, padding=0)

        # In the second layer we have a bigger kernel size to capture higher-level patterns
        # and add padding of 1 to keep the spatial resolution from being further decreased.
        self.conv2 = nn.Conv2d(64, 124, kernel_size=3, padding=1)

        # We now move to the fully connected layers. We do not apply pooling because there
        # is no need for downsampling: the tensors are already small.
        # After conv2 each sample has shape (124, 7, 7), which flattens to
        # 7 * 7 * 124 = 6076 features.
        # The hidden layer outputs 2048 neurons; this number is a starting point to be
        # tuned against under- and overfitting.
        self.fc1 = nn.Linear(7 * 7 * 124, 2048)

        # One output per (source square, destination square) pair: 64 * 64 = 4096.
        self.fc2 = nn.Linear(2048, 4096)

    def forward(self, input):
        """Run a batch of boards through the network.

        Args:
            input: tensor of shape ``(N, 12, 8, 8)`` for any batch size ``N``.

        Returns:
            Tensor of shape ``(N, 4096)``; each row sums to 1 (softmax over moves).
        """
        # Convolutional Layer 1: (N, 12, 8, 8) -> (N, 64, 7, 7)
        output1 = F.relu(self.conv1(input))
        # Convolutional Layer 2: (N, 64, 7, 7) -> (N, 124, 7, 7)
        output2 = F.relu(self.conv2(output1))
        # Flatten everything except the batch dimension: (N, 124, 7, 7) -> (N, 6076).
        # The batch size is taken from the tensor itself rather than hard-coded, so
        # mini-batches of any size work.
        output3 = torch.flatten(output2, start_dim=1)

        output4 = F.relu(self.fc1(output3))
        output5 = self.fc2(output4)
        # NOTE(review): probabilities are returned here. A loss such as
        # nn.CrossEntropyLoss expects raw logits, so pair this output with a loss
        # that takes probabilities, or confirm the intended loss.
        return F.softmax(output5, dim=1)  # (N, 4096)


In [None]:
# Inspect the shape of the encoded board positions
# (the prose below quotes (7202, 8, 8, 12) — TODO confirm against this output).
dataset.positions.shape

In [None]:
# Inspect the shape of the encoded target moves
# (presumably one 4096-long one-hot row per position — TODO confirm).
dataset.moves.shape

To run the input dataset through the first convolutional layer in PyTorch, we need to make sure that the input tensor has the shape expected by `nn.Conv2d`. The `nn.Conv2d` layer expects the input tensor to have the shape $(N, C, H, W)$, where:

- $N$ is the batch size,
- $C$ is the number of channels,
- $H$ is the height of the image (or, in our case, the chessboard),
- $W$ is the width of the image (or the chessboard).

Given the dataset shape $(7202, 8, 8, 12)$, the channels are the last dimension, but `nn.Conv2d` needs them as the second dimension. We therefore need to rearrange the dimensions of the input tensor to $(N, C, H, W)$, which in our case is $(7202, 12, 8, 8)$, for example with `tensor.permute(0, 3, 1, 2)`.

In [None]:
import torch.optim as optim

# NOTE(review): `model`, `num_epochs`, `train_loader` and `loss_function` are
# not defined in any visible cell of this notebook; they must exist before
# this cell can run on a fresh kernel.
# NOTE(review): if `model` is the ChessCNN defined in this notebook, its
# forward pass already applies softmax, so `loss_function` must accept
# probabilities rather than raw logits — confirm.

# Define the optimizer, for example Adam
optimizer = optim.Adam(model.parameters(), lr=0.001)  # lr is the learning rate


# Training loop
for epoch in range(num_epochs):
    for batch in train_loader:  # Assumes an iterable (e.g. a DataLoader) yielding (inputs, labels) pairs
        inputs, labels = batch

        # Zero the parameter gradients so they do not accumulate across steps
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)
        loss = loss_function(outputs, labels)

        # Backward pass and optimize
        loss.backward()
        optimizer.step()


In this training loop:

- `optimizer.zero_grad()` clears old gradients from the last step (otherwise they would be accumulated).
- `loss.backward()` computes the gradient of the loss w.r.t. the parameters (weights) of the model.
- `optimizer.step()` updates the weights.

The learning rate is a critical hyperparameter in training neural networks and needs to be chosen carefully. If it's too high, the model might overshoot the optimal solution. If it's too low, training can be slow, or the model might get stuck in a local minimum.

# Reference snippet: a hand-written mini-batch SGD loop.
# NOTE(review): `X`, `Y`, `C`, `W1`, `b1`, `W2`, `b2` and `parameters` are not
# defined anywhere in the visible notebook, and the shape comments below
# describe a different (character-level) model — confirm before reusing.
#
# As we get a sense that the loss is starting to plateau
# we can apply a learning-rate decay.
# Note that `lr` stays fixed at this value for the whole loop below.
lr = 0.01

for i in range(10_000):
    # Minibatch construct: sample 32 random row indices into X
    ix = torch.randint(0, X.shape[0], (32,))
    
    # Forward pass
    emb = C[X[ix]] # (32, 3, 2) presumably: rows of X mapped to their embedding vectors via C
    h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # (32, 100) 32 inputs, 100 neurons
    logits = h @ W2 + b2 # (32, 27) 32 inputs, 27 possible character outcomes
    loss = F.cross_entropy(logits, Y[ix])
    
    # Backward pass
    
    # Reset the gradients so values from the previous step are not accumulated
    for p in parameters:
        p.grad = None
    
    # This will compute all the gradients
    loss.backward()
    
    # Parameter update: plain gradient-descent step with step size lr
    for p in parameters:
        p.data += -lr * p.grad
        
# This is the loss of the last minibatch ONLY, not the loss over the full dataset
print(loss.item())