In [10]:
import os
import math
from datetime import datetime
from glob import glob
from tqdm import tqdm
import numpy as np
import torch
from torch.utils.tensorboard import SummaryWriter

from lib.service import SamplesService
from lib.model import NnueModel
from lib.model import decode_int64_bitset
from lib.serialize import NnueWriter
from lib.puzzles import PuzzleAccuracy

In [11]:
# Scale factor shared by the losses in this cell: raw network outputs are
# divided by it before being compared or passed through a sigmoid.
SCALING = 356.0

class PQRLoss(torch.nn.Module):
    """Loss over triplets of network outputs ``(p, q, r)``.

    The network output is regrouped into rows of three values, each divided by
    ``SCALING``. The loss is the sum of two terms:

    * ``-mean(log(sigmoid(r - q)))`` -- minimised by making ``r`` larger than ``q``.
    * ``mean((p + q) ** 2)`` -- minimised by driving ``p + q`` towards zero.
    """

    def __init__(self):
        super().__init__()

    def forward(self, output, _target):
        """Compute the PQR loss.

        Args:
            output: tensor whose number of elements is a multiple of 3; it is
                reshaped to ``(-1, 3)`` and the columns are read as p, q, r.
            _target: unused; accepted so the call signature matches the other
                loss in this notebook.

        Returns:
            Scalar tensor with the summed loss.
        """
        triplets = output.reshape(-1, 3) / SCALING
        p, q, r = triplets[:, 0], triplets[:, 1], triplets[:, 2]

        # logsigmoid is the numerically stable form of log(sigmoid(x)): the
        # naive composition underflows to log(0) = -inf once (r - q) is
        # strongly negative, which would turn the whole loss into inf.
        ranking_term = -torch.mean(torch.nn.functional.logsigmoid(r - q))
        symmetry_term = torch.mean(torch.square(p + q))

        return ranking_term + symmetry_term

class EvalLoss(torch.nn.Module):
    """Loss between the network output and a target evaluation in centipawns.

    Both values are squashed into [0, 1] with a sigmoid and compared there;
    the loss is the mean of ``|difference| ** 2.5``.
    """

    def __init__(self):
        super().__init__()

    def forward(self, output, target):
        """Return the mean WDL-space error between ``output`` and ``target``.

        Args:
            output: raw network output (divided by ``SCALING`` before the sigmoid).
            target: target evaluation in UCI centipawns.

        Returns:
            Scalar tensor with the mean loss.
        """
        # Convert UCI centipawns into Stockfish's internal engine units, see
        # https://github.com/official-stockfish/Stockfish/blob/fb07281f5590bc216ecbacd468aa0d06fdead70c/src/uci.cpp#L341
        target_internal = target * SCALING / 100.0

        # Map both sides from CP-space into WDL-space, i.e. the range [0, 1]
        predicted_wdl = (output / SCALING).sigmoid()
        expected_wdl = (target_internal / SCALING).sigmoid()

        return (predicted_wdl - expected_wdl).abs().pow(2.5).mean()

In [12]:
# --- Training configuration ---------------------------------------------------
EPOCHS = 100000
BATCHES_PER_EPOCH = 1000
BATCH_SIZE = 4096

FEATURE_SET = "basic"
NUM_FEATURES = 768
METHOD = "eval"  # one of "pqr" or "eval"

# Per-method batch shapes, dataset files and loss.
# X arrives with NUM_FEATURES // 64 entries per feature vector and is expanded
# with decode_int64_bitset in the training loop (presumably 64 features packed
# per int64 word -- confirm against SamplesService).
if METHOD == "pqr":
    X_SHAPE = (BATCH_SIZE, 3, 2, NUM_FEATURES // 64)
    Y_SHAPE = (BATCH_SIZE, 0)
    INPUTS = glob("/mnt/d/datasets/pqr-1700/*.csv")
    loss_fn = PQRLoss()
elif METHOD == "eval":
    X_SHAPE = (BATCH_SIZE, 2, NUM_FEATURES // 64)
    Y_SHAPE = (BATCH_SIZE, 1)
    INPUTS = glob("/mnt/d/datasets/eval/pepe.csv")
    loss_fn = EvalLoss()
else:
    # Without this branch an unknown method only surfaces later as a NameError
    # on X_SHAPE / loss_fn, far from the actual mistake.
    raise ValueError(f"Unknown METHOD {METHOD!r}; expected 'pqr' or 'eval'")

# glob() silently returns an empty list when the dataset path does not exist.
if not INPUTS:
    raise FileNotFoundError(f"No input files found for METHOD {METHOD!r}")

# --- Run folder ---------------------------------------------------------------
# One folder per run, named after the start time and the main settings; it
# holds the TensorBoard logs and the saved models.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
folder = f'runs/{timestamp}_{METHOD}_{FEATURE_SET}_{BATCH_SIZE}'
os.makedirs(f'{folder}/models', exist_ok=True)

# --- Data sources, model and optimisation objects -----------------------------
puzzles = PuzzleAccuracy('/mnt/c/Users/mlomb/Desktop/Tesis/cs-master-thesis/puzzles.csv')
samples_service = SamplesService(x_shape=X_SHAPE, y_shape=Y_SHAPE, inputs=INPUTS, feature_set=FEATURE_SET, method=METHOD)
chessmodel = NnueModel(num_features=NUM_FEATURES)
chessmodel.cuda()  # requires a CUDA-capable device

optimizer = torch.optim.Adam(chessmodel.parameters(), lr=0.0015)
# Multiply the learning rate by 0.7 once the monitored value has stopped
# improving (by more than the threshold) for 10 scheduler steps.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', threshold=0.0001, factor=0.7, patience=10)
writer = SummaryWriter(folder)
# @torch.compile # 30% speedup
def train_step(X, y):
    """Run a single optimisation step on one batch.

    Uses the module-level ``chessmodel``, ``optimizer`` and ``loss_fn``.

    Args:
        X: model input for the batch.
        y: target passed to ``loss_fn`` together with the model output.

    Returns:
        The loss tensor computed for this batch (before the weight update).
    """
    # Start from clean gradients so nothing is carried over from the last step
    optimizer.zero_grad()

    # Forward pass and loss, then backpropagate
    batch_loss = loss_fn(chessmodel(X), y)
    batch_loss.backward()

    # Apply the update, then let the model clip its own weights
    optimizer.step()
    chessmodel.clip_weights()

    return batch_loss

# Put the model in training mode. Note that .train() only switches the
# train/eval behaviour of the modules; it does not enable or disable
# gradient tracking.
chessmodel.train()

# Engine binary used to score every saved checkpoint on the puzzle set.
BOT_BINARY = "/mnt/c/Users/mlomb/Desktop/Tesis/cs-master-thesis/bot/target/release/bot"

try:
    for epoch in range(EPOCHS):
        avg_loss = 0.0

        for _ in tqdm(range(BATCHES_PER_EPOCH), desc=f'Epoch {epoch}'):
            # NOTE(review): the recorded run of this cell died with
            # BrokenPipeError right after the sample reader panicked on an
            # empty FEN -- presumably raised from this call when the input
            # file is exhausted or ends with a blank line; verify in
            # SamplesService.
            X, y = samples_service.next_batch()

            # expand bitset
            X = decode_int64_bitset(X)
            X = X.reshape(-1, 2, NUM_FEATURES)

            loss = train_step(X, y)
            avg_loss += loss.item()

            # A single NaN batch poisons the running sum, so this fires on
            # the first bad batch.
            if math.isnan(avg_loss):
                raise RuntimeError("Loss is NaN, exiting")

        avg_loss /= BATCHES_PER_EPOCH

        # Step the scheduler on the epoch's mean training loss
        scheduler.step(avg_loss)

        # Save the model both as a PyTorch state dict and in the .nn format
        # consumed by the bot binary
        model_path = f'{folder}/models/{epoch}'
        model_pth = f'{model_path}.pth'
        model_nn = f'{model_path}.nn'
        torch.save(chessmodel.state_dict(), model_pth)
        nn_writer = NnueWriter(chessmodel, FEATURE_SET)
        with open(model_nn, "wb") as f:
            f.write(nn_writer.buf)

        # Run metrics: puzzle accuracy of the freshly written network
        puzzles_res = puzzles.measure([BOT_BINARY, f"--nn={model_nn}"])

        # Log to tensorboard
        writer.add_scalar('Train/loss', avg_loss, epoch)
        # Read the current LR from the optimizer (public API) instead of the
        # scheduler's private _last_lr attribute.
        writer.add_scalar('Train/lr', optimizer.param_groups[0]['lr'], epoch)
        writer.add_scalar('Params/mean-f1', chessmodel.ft.weight.mean().item(), epoch)
        writer.add_scalar('Params/mean-l1', chessmodel.linear1.weight.mean().item(), epoch)
        writer.add_scalar('Params/mean-l2', chessmodel.linear2.weight.mean().item(), epoch)
        writer.add_scalar('Params/mean-out', chessmodel.output.weight.mean().item(), epoch)
        for name, param in chessmodel.named_parameters():
            writer.add_histogram(name, param, epoch)
        for rating_min, rating_max, accuracy in puzzles_res:
            writer.add_scalar(f'Puzzles/{rating_min}-{rating_max}/accuracy', accuracy, epoch)
        # Flush once everything for this epoch (including the puzzle scalars)
        # has been queued.
        writer.flush()
finally:
    # Make sure pending events reach disk even when training aborts.
    writer.close()


Epoch 0:   0%|          | 0/1000 [00:00<?, ?it/s]Reading samples from /mnt/d/datasets/eval/pepe.csv
fen: [50, 114, 50, 114, 107, 49, 47, 112, 50, 110, 113, 112, 50, 47, 49, 112, 49, 112, 49, 112, 49, 66, 47, 49, 98, 112, 53, 47, 51, 78, 52, 47, 56, 47, 80, 80, 80, 75, 49, 80, 80, 80, 47, 82, 50, 81, 51, 82, 32, 98, 32, 45, 32, 45, 32, 49, 32, 49, 55]
score: [45, 51, 49, 57]
bestmove: [100, 55, 101, 53, 13, 10]
fen: 2r2rk1/p2nqp2/1p1p1p1B/1bp5/3N4/8/PPPK1PPP/R2Q3R b - - 1 17
fen: [56, 47, 112, 53, 82, 112, 47, 56, 47, 52, 107, 51, 47, 56, 47, 52, 80, 50, 80, 47, 80, 49, 80, 53, 47, 50, 75, 53, 32, 119, 32, 45, 32, 45, 32, 49, 32, 51, 49]
score: [53, 57, 53]
bestmove: [103, 55, 97, 55, 13, 10]
fen: 8/p5Rp/8/4k3/8/4P2P/P1P5/2K5 w - - 1 31
fen: []
score: []
bestmove: []
thread 'main' panicked at src/method/eval.rs:93:61:
called `Result::unwrap()` on an `Err` value: InvalidFen
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace
Epoch 0:   0%|          | 0/1000 [00:

BrokenPipeError: [Errno 32] Broken pipe