In [1]:
import os.path
import numpy as np
import time
import torch
import sys
from torch import nn
from torch.utils.data import DataLoader

from bbnn_dataset import BBNNDataset

In [2]:
# Why do we need to scale the network score prediction? It seems that
# this is necessary in order to keep weights & biases small,
# so we chose a scale of 1000 - see PRED_SCALE.
#
# For the loss function we compare the sigmoid of the scores (pred vs target)
# in order to focus more on smaller absolute scores.
# The sigmoid then has to be stretched to the centipawn range, i.e. we must
# decide: for how big a score are we almost winning (sigmoid approaches 1)?
# Now: sigmoid(4) = 0.982
# We want to have that win probability for a score of 600 cp.
# Then the stretch factor must be 4 / 600 = 1 / 150.

# Factor applied to the raw network output in BBNN.forward.
PRED_SCALE = 1000.0
# Factor applied to predicted and target scores before the sigmoid in loss_fn.
SCORE_SIGMOID_SCALE = 1.0 / 150.0

# For the model:
# The input layer of BBNN takes NUM_INPUTS * 2 features.
NUM_INPUTS = 384
# Width of the hidden layer of BBNN.
L1 = 32
# NOTE(review): L2 is not referenced by any cell visible here - confirm whether it is still needed.
L2 = 128

In [3]:
# Define model - only test
class BBNN(nn.Module):
    """Small evaluation network with a single hidden layer (test version).

    Layout: a linear layer from ``NUM_INPUTS * 2`` features to ``L1`` units,
    a clamp of the activations to ``[0, 1]``, and a bias-free linear layer
    down to one value, which is multiplied by ``PRED_SCALE``.
    """

    def __init__(self):
        super().__init__()
        # Creation order is kept: it fixes the parameter order and the
        # attribute names are the state_dict keys.
        self.input  = nn.Linear(2 * NUM_INPUTS, L1)
        self.output = nn.Linear(L1, 1, bias=False)

    def forward(self, x):
        """Return the scaled score prediction for the feature batch ``x``."""
        hidden = self.input(x).clamp(min=0.0, max=1.0)
        return self.output(hidden) * PRED_SCALE

In [4]:
# Calculate loss based on sigmoid of the scores.
# A variant that additionally added a small fraction (CPFRAC = 0.0001) of the
# absolute score difference is currently disabled - see the note in loss_fn.

def loss_fn(pred, y, batch_no = 0):
    """Mean squared difference between the sigmoids of predicted and target scores.

    Both scores are multiplied by SCORE_SIGMOID_SCALE and passed through a
    sigmoid before they are compared, so that differences between small
    absolute scores weigh more than differences between large ones.

    Args:
        pred: Tensor of predicted scores.
        y: Tensor of target scores. If its shape differs from ``pred`` but it
            holds the same number of elements, it is reshaped to ``pred``'s
            shape before the comparison.
        batch_no: Not used; kept so existing callers can keep passing it.

    Returns:
        Scalar tensor holding the mean squared difference.
    """
    # A (batch, 1) prediction compared with a (batch,) target would silently
    # broadcast to a (batch, batch) matrix and average over all pairings
    # instead of over matching pairs. Align the target when that is possible;
    # any other shape combination keeps the normal broadcasting behaviour.
    if y.shape != pred.shape and y.numel() == pred.numel():
        y = y.reshape(pred.shape)

    wdl_eval_model  = torch.sigmoid(pred * SCORE_SIGMOID_SCALE)
    wdl_eval_target = torch.sigmoid(y    * SCORE_SIGMOID_SCALE)

    # Disabled variant:
    # ((wdl_eval_target - wdl_eval_model).square() + torch.abs(pred - y) * CPFRAC).mean()
    return (wdl_eval_target - wdl_eval_model).square().mean()

In [5]:
def train(device, dataloader, model, loss_fn, optimizer, train_pos):
    """Run one training pass over ``dataloader`` and print progress lines.

    Args:
        device: Device the batches are moved to before the forward pass.
        dataloader: Iterable yielding ``(X, y)`` batches.
        model: Module to optimise; it is switched to training mode.
        loss_fn: Callable invoked as ``loss_fn(pred, y, batch_no)`` that
            returns a scalar loss tensor.
        optimizer: Optimizer that is zeroed and stepped once per batch.
        train_pos: Position count that is only shown in the progress output.

    Returns:
        Tuple ``(train_inst, mloss)``: the number of samples processed and the
        sample-weighted mean loss over the whole pass.

    Raises:
        ValueError: If the dataloader yields no samples.
    """
    print(f'Train on {device} with {train_pos} positions')
    start = time.time()
    train_inst = 0
    train_loss = 0
    batch_report = None
    model.train()
    for batch_no, (X, y) in enumerate(dataloader, start=1):
        n = X.shape[0]
        train_inst += n
        if batch_report is None:
            # Report about every 200000 samples, based on the first batch size.
            # Clamp to at least 1: batches above 200000 samples would give a
            # period of 0 and make the modulo below fail.
            batch_report = max(1, 200000 // max(1, n))
        X, y = X.to(device), y.to(device)

        optimizer.zero_grad()
        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y, batch_no)

        # Backpropagation
        loss.backward()
        optimizer.step()
        # Weight by the batch size so a smaller last batch does not skew the mean.
        train_loss += loss.item() * n

        if batch_no % batch_report == 0:
            tdiff = time.time() - start
            # A coarse clock can report zero elapsed time for the first report.
            ips = round(train_inst / tdiff) if tdiff > 0 else 0
            nows = time.strftime('%X %x')
            mloss = train_loss / train_inst
            print(f"loss: {mloss:>7f} [{train_inst:>7d}/{train_pos:>7d}] {nows}: {ips:>6d} samples/second")

    if train_inst == 0:
        raise ValueError('train(): the dataloader yielded no samples')

    nows = time.strftime('%X %x')
    mloss = train_loss / train_inst
    print(f"Epoch loss: {mloss:>7f} [{train_inst:>7d}/{train_inst:>7d}] {nows}")
    return train_inst, mloss

In [6]:
def test(device, dataloader, model, loss_fn):
    model.eval()
    test_inst = 0
    test_loss = 0
    with torch.no_grad():
        batch_no = 0
        for X, y in dataloader:
            batch_no += 1
            n = X.shape[0]
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item() * n
            test_inst += n

    test_loss /= test_inst
    print(f"Test Error Avg loss: {test_loss:>8f} \n")
    return test_loss

In [7]:
def evaluate(device, dataloader, model, num):
    """Print the model's predictions batch by batch.

    The model is switched to evaluation mode and no gradients are recorded.
    Iteration stops after the first batch with which the number of printed
    instances reaches ``num``; at least one batch is printed if the loader
    yields any.

    Args:
        device: Device the batches are moved to before the forward pass.
        dataloader: Iterable yielding feature batches (no targets).
        model: Module used to compute the predictions.
        num: Number of instances after which printing stops.

    Returns:
        None.
    """
    model.eval()
    eval_inst = 0
    with torch.no_grad():
        for batch in dataloader:
            n = batch.shape[0]
            pred = model(batch.to(device))
            print(f'Eval instances {eval_inst + 1} to {eval_inst + n}: {pred}')
            eval_inst += n
            if eval_inst >= num:
                break

In [8]:
# Data locations. The defaults are the original local paths; set the
# BBNN_TRAIN_DIR / BBNN_TEST_DIR environment variables to run the notebook on
# another machine without editing this cell.
train_dir = os.environ.get('BBNN_TRAIN_DIR', r'C:\data\extract\2025\beenine\train')
test_dir = os.environ.get('BBNN_TEST_DIR', r'C:\data\extract\2025\beenine\test')

# Training hyperparameters.
batch_size = 256
lr = 0.01
weight_decay = 0.0

# Use the GPU when one is available, otherwise fall back to the CPU.
device = ('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device is {device}')

Device is cuda


In [9]:
# Build the network, move it to the selected device and show its layers.
model = BBNN().to(device)
print(model)

BBNN(
  (input): Linear(in_features=768, out_features=32, bias=True)
  (output): Linear(in_features=32, out_features=1, bias=False)
)


In [10]:
# Plain SGD. weight_decay is taken from the configuration cell so that changing
# it there actually takes effect; it is currently 0.0, which equals the SGD
# default, so the optimisation itself is unchanged.
# Alternative: torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay)

In [11]:
# Display the optimizer to check which hyperparameters are in effect.
optimizer

SGD (
Parameter Group 0
    dampening: 0
    differentiable: False
    foreach: None
    fused: None
    lr: 0.01
    maximize: False
    momentum: 0
    nesterov: False
    weight_decay: 0
)

In [12]:
# Training data first, then test data - one dataset per directory.
training_data, test_data = (
    BBNNDataset(data_dir) for data_dir in (train_dir, test_dir)
)

In [13]:
# Create data loaders: one per dataset, both with identical settings.
train_dataloader, test_dataloader = (
    DataLoader(dataset, batch_size=batch_size, pin_memory=False)
    for dataset in (training_data, test_data)
)

In [15]:
# Baseline: loss on the test set before the training loop below is run,
# to compare the per-epoch test losses against.
test_loss = test(device, test_dataloader, model, loss_fn)
print(f'Initial test loss: {test_loss}')

Open feature file C:\data\extract\2025\beenine\test\xaa-feat.txt
Open target  file C:\data\extract\2025\beenine\test\xaa-targ.txt
Test Error Avg loss: 0.089573 

Initial test loss: 0.08957291672730446


In [16]:
# Number of passes over the training data and prefix of the checkpoint files.
epochs = 3
save_prefix = 'save-sgd'

# Mean loss per epoch, filled in by the loop below.
train_losses = []
test_losses = []

# Sample count of the previous epoch; 0 until the first epoch has finished.
# train() only uses it in its progress output.
train_pos = 0
start = time.time()

for t in range(epochs):
    print(f"Epoch {t+1} from {epochs}\n-------------------------------")
    train_pos, train_loss = train(device, train_dataloader, model, loss_fn, optimizer, train_pos)
    train_losses.append(train_loss)
    # One checkpoint per epoch; the file name carries the zero-based epoch index.
    save_name = f"{save_prefix}-{t}.pth"
    torch.save(model.state_dict(), save_name)
    print(f"Saved PyTorch Model State to {save_name}")
    test_loss = test(device, test_dataloader, model, loss_fn)
    test_losses.append(test_loss)

    # Estimate the remaining time from the average duration of the epochs so far.
    tdiff = time.time() - start
    spe = tdiff / (t + 1)
    rem = round((epochs - t - 1) * spe)
    # No estimate is printed after the last epoch.
    if t + 1 < epochs:
        spe = round(spe)
        print(f"{spe} seconds per epoch - {rem} seconds remaining\n-------------------------------")
print('Done')

Epoch 1 from 3
-------------------------------
Train on cuda with 0 positions
Open feature file C:\data\extract\2025\beenine\train\xab-feat.txt
Open target  file C:\data\extract\2025\beenine\train\xab-targ.txt
loss: 0.084377 [ 199936/      0] 12:43:21 02/08/25:  55988 samples/second
loss: 0.092933 [ 399872/      0] 12:43:24 02/08/25:  55766 samples/second
loss: 0.092004 [ 599808/      0] 12:43:28 02/08/25:  56136 samples/second
loss: 0.091971 [ 799744/      0] 12:43:31 02/08/25:  56417 samples/second
loss: 0.088257 [ 999680/      0] 12:43:35 02/08/25:  56620 samples/second
Open feature file C:\data\extract\2025\beenine\train\xac-feat.txt
Open target  file C:\data\extract\2025\beenine\train\xac-targ.txt
loss: 0.089984 [1199616/      0] 12:43:38 02/08/25:  56520 samples/second
loss: 0.089425 [1399552/      0] 12:43:42 02/08/25:  56590 samples/second
loss: 0.090557 [1599488/      0] 12:43:45 02/08/25:  56693 samples/second
loss: 0.092122 [1799424/      0] 12:43:49 02/08/25:  56854 samples

In [17]:
# Training loss of each epoch relative to the first epoch (1.0 = unchanged).
np.array(train_losses) / train_losses[0]

array([1.        , 0.99851959, 0.99833253])

In [18]:
# Test loss after each epoch relative to the test loss after the first epoch (1.0 = unchanged).
np.array(test_losses) / test_losses[0]

array([1.        , 0.99978238, 0.99968236])

In [19]:
# Inspect the optimizer's parameter groups; this dumps the full parameter
# tensors together with the group's hyperparameters.
optimizer.param_groups

[{'params': [Parameter containing:
   tensor([[ 0.0174, -0.0100, -0.0172,  ...,  0.0074,  0.0218,  0.0305],
           [-0.0275,  0.0294,  0.0093,  ..., -0.0183,  0.0007, -0.0233],
           [ 0.0324,  0.0100, -0.0281,  ...,  0.0318, -0.0152,  0.0088],
           ...,
           [-0.0116, -0.0066, -0.0237,  ...,  0.0131, -0.0314, -0.0262],
           [-0.0125, -0.0133, -0.0318,  ..., -0.0158,  0.0224,  0.0326],
           [ 0.0181,  0.0057, -0.0040,  ...,  0.0045, -0.0048,  0.0069]],
          device='cuda:0', requires_grad=True),
   Parameter containing:
   tensor([-0.0133, -0.0363, -0.0033, -0.0868, -0.0398, -0.0707, -0.0663, -0.0546,
           -0.0270, -0.0715, -0.0158, -0.0541, -0.0571, -0.0424,  0.0350, -0.0542,
           -0.0053, -0.0452, -0.0182, -0.0236, -0.0102, -0.0469,  0.0252,  0.0782,
           -0.0688, -0.0534,  0.0032, -0.0233, -0.0799, -0.0285, -0.0124,  0.0183],
          device='cuda:0', requires_grad=True),
   Parameter containing:
   tensor([[ 0.0072,  0.0309,  

In [20]:
# First parameter tensor of the first group - presumably the weight matrix of
# the model's input layer (TODO confirm against model.named_parameters()).
optimizer.param_groups[0]['params'][0]

Parameter containing:
tensor([[ 0.0174, -0.0100, -0.0172,  ...,  0.0074,  0.0218,  0.0305],
        [-0.0275,  0.0294,  0.0093,  ..., -0.0183,  0.0007, -0.0233],
        [ 0.0324,  0.0100, -0.0281,  ...,  0.0318, -0.0152,  0.0088],
        ...,
        [-0.0116, -0.0066, -0.0237,  ...,  0.0131, -0.0314, -0.0262],
        [-0.0125, -0.0133, -0.0318,  ..., -0.0158,  0.0224,  0.0326],
        [ 0.0181,  0.0057, -0.0040,  ...,  0.0045, -0.0048,  0.0069]],
       device='cuda:0', requires_grad=True)