In [15]:
import chess.pgn
from IPython.display import clear_output
import math
from typing import NamedTuple, Tuple
import torch
from torch import nn
import torch.nn.functional as F
import numpy as np
import chess
import chess.engine
from tqdm import tqdm 
import csv
import requests
import gc
import requests
import numpy as np
from concurrent.futures import ThreadPoolExecutor
import time
import pickle

Download the PGN Database with 4,471,785 games. 

This can be done from a notebook cell with the commands below (drop the leading `!` to run them in a terminal instead):

!wget https://l--l.top/ajotb-pgn-000

!7z x /content/ajotb-pgn-000

Extract the moves from the PGN file. This dataset contains little other metadata (Elo ratings, openings, etc.), so we will stick to the moves.

In [None]:
pgn_file_path = "/content/AJ-OTB-PGN-000.pgn"
# One entry per stored game: that game's mainline moves as strings, in playing order.
games = []
#black_elo = []
#white_elo = []
with open(pgn_file_path) as pgn_file:
    # Number of non-empty games stored so far. Starts at 0 so that the
    # progress message reports the real count (it always equals len(games)).
    game_number = 0
    while True:
        game = chess.pgn.read_game(pgn_file)
        # None means no further game could be read from the file: stop.
        if game is None:
            break
        moves = [str(m) for m in game.mainline_moves()]
        # Games without any mainline move carry no training signal; skip them.
        if len(moves) > 0:
            games.append(moves)
            #black_elo.append(game.headers.get("BlackElo", "N/A"))
            #white_elo.append(game.headers.get("WhiteElo", "N/A"))
            game_number += 1
            if game_number % 10000 == 0:
                # wait=True defers clearing until the new message is ready,
                # which avoids the output area flickering on every update.
                clear_output(wait=True)
                print(f"Processed {game_number} games")

Now we have a list of move lists (one per game), which we need to turn into a mapping from each position (stored as a FEN string) to the distinct moves that humans played from that position.

In [None]:
# Maps a position (FEN string) to the list of distinct moves (UCI strings)
# that were played from it, in order of first appearance.
move_dict = {}

for game in games:
    board = chess.Board()
    # (fen, move) pairs seen in this game; they are merged into move_dict
    # only if the whole game replays without an illegal move.
    pairs = []
    for uci in game:
        candidate = chess.Move.from_uci(uci)
        # Some games contain illegal moves: drop the entire game in that case.
        if candidate not in board.legal_moves:
            break
        # Record the position before the move together with the human move,
        # then advance the board.
        pairs.append((board.fen(), uci))
        board.push(candidate)
    else:
        # Reached only when the loop above did not break, i.e. every move was legal.
        for fen, uci in pairs:
            known_moves = move_dict.setdefault(fen, [])
            if uci not in known_moves:
                known_moves.append(uci)

Save this dictionary in chunks of 1,000,000 positions.

In [None]:
import os
import pickle

# Directory receiving the pickled chunks. It is created up front because
# open(..., 'wb') does not create missing parent directories and would
# otherwise fail with FileNotFoundError on a fresh environment.
output_dir = 'chess_data'
os.makedirs(output_dir, exist_ok=True)

# Positions accumulated for the file currently being filled.
chunk = {}
# Maximum number of positions (dictionary keys) written per file.
chunk_size = 1000000
# Zero-based sequence number used in the output file name.
index = 0
for key, value in tqdm(move_dict.items()):
    chunk[key] = value
    if len(chunk) >= chunk_size:
        with open(f'{output_dir}/move_dict_{index:03d}.pkl', 'wb') as f:
            pickle.dump(chunk, f)
        # Start a fresh dict so the written chunk can be garbage-collected.
        chunk = {}
        index += 1
# Flush the final, partially filled chunk (if any positions remain).
if chunk:
    with open(f'{output_dir}/move_dict_{index:03d}.pkl', 'wb') as f:
        pickle.dump(chunk, f)