In [1]:
import chess
import chess.pgn
import chess.engine
import io
import os
import logging

import numpy as np
import pandas as pd

from math import ceil
from collections import defaultdict as dd
from IPython.display import SVG, display
from tqdm.auto import tqdm

In [2]:
### Basic structure

DATA_DIR = "data/"
STOCKFISH_DIR = 'stockfish/'
ARCHIVE_DIR = DATA_DIR + 'archives/'

###
##
###

### Stockfish

STOCKFISH_AVX512_TAR = "stockfish-ubuntu-x86-64-avx512.tar"
STOCKFISH_AVX512 = "stockfish-ubuntu-x86-64-avx512"
STOCKFISH_AVX512_EXE = STOCKFISH_DIR + STOCKFISH_AVX512


###
##
###

### URLs

ELITE_DATABASE_URL  = "https://database.nikonoel.fr/lichess_elite_2021-11.zip"
EVAL_DATABASE_URL   = "https://database.lichess.org/lichess_db_eval.jsonl.zst"
STOCKFISH_DOWNSTREAM = "https://github.com/official-stockfish/Stockfish/releases/latest/download/"
STOCKFISH_AVX512_URL = STOCKFISH_DOWNSTREAM + STOCKFISH_AVX512_TAR

### 
##
###

### Datasets 

ELITE_DATASET_ARCHIVE = "lichess_elite_2021-11.zip"
ELITE_DATASET_FILENAME = "lichess_elite_2021-11.pgn"


LICHESS_EVAL_ARCHIVE = "lichess_db_eval.jsonl.zst"
LICHESS_EVAL_FILENAME = "lichess_db_eval.jsonl"


In [3]:
logging.basicConfig(filename='test.log',
                    filemode='a',
                    format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
                    datefmt='%H:%M:%S',
                    level=logging.DEBUG)

In [4]:
![ ! -f $STOCKFISH_AVX512_TAR ] && wget $STOCKFISH_AVX512_URL
![ ! -f $STOCKFISH_AVX512_EXE ] && tar -xf $STOCKFISH_AVX512_TAR

In [5]:
class DatasetTool:

    def __init__(self, name):
        self.bitboards_dir_name = "bitboards"
        self.name = name
        self.root_path = f"{DATA_DIR}{self.name}/"
        self.bitboards_path = f"{self.root_path}{self.bitboards_dir_name}/"

    def prepare_dataset(self, url, archive_name):
        self.create_root_dirs()
        self.download_archive(url, archive_name)
        self.unpack_archive(archive_name)
    
    def create_root_dirs(self):
        try:
            os.mkdir(DATA_DIR + self.name)
            os.mkdir(self.bitboards_path)
            os.mkdir(DATA_DIR + "stats")
        except FileExistsError:
            print("The directory already exists")
            
    def unpack_archive(self, archive_name):
        archive_path = ARCHIVE_DIR + archive_name
        if not os.path.exists(archive_path):
            print("Archive does not exist.")
            return 
            
        filename, ext = os.path.splitext(archive_path)
        if len(os.listdir(self.root_path)) > 1:
            print("The archive was already unpacked.")
            return

        if ext == ".zst":
            !unzstd {archive_path} --output-dir-flat {self.root_path}
        elif ext == ".zip":
            !unzip {archive_path} -d {self.root_path}
        else:
            print("Unknown compression.")

    def download_archive(self, url, archive_name=None):
        if os.path.exists(ARCHIVE_DIR + archive_name):
            print("Archive already downloaded.")
            return

        !wget {url} -P {ARCHIVE_DIR}

    def get_bitboard_path(self, i):
        files = os.listdir(self.bitboards_path)
        result = list(filter(lambda x : x.startswith(i), files))
        if len(result) > 1:
            print("Ambiguous identifier specify the start of the filename more preciseley.")
        elif len(result) == 1:
            return f"{self.bitboards_path}{result[0]}"
        else:
            print("No files found.")        

    def create_bitboard_path(self, pos_limit, skip, depth_min, depth_max):
        return f"{self.bitboards_path}{pos_limit}_{skip}_{depth_min}_{depth_max}.csv"

    def dataset_file_name(self):
        files = os.listdir(self.root_path)

        file = list(filter(lambda x: not os.path.isdir(self.root_path + x), files))     
        return self.root_path + file[0]


In [6]:
elite_dataset_tool = DatasetTool("elite_dataset")
eval_dataset_tool = DatasetTool("eval_dataset")

In [47]:
# Create necessary data 
# eval_dataset_tool.prepare_dataset(EVAL_DATABASE_URL, LICHESS_EVAL_ARCHIVE)
# In this instance a lot of data can be discarded like the pvs lines
# !sed -i 's/"line":"[^"]*"/"line":""/g' {eval_dataset_tool.dataset_file_name()} # remove not needed lines

In [9]:
# Create necessary data 
# elite_dataset_tool.prepare_dataset(ELITE_DATABASE_URL, ELITE_DATASET_ARCHIVE)

In [7]:
class PGNProvider:
    
    def __init__(self, path : str, skip : int = 0):
        c = 0
        for line in open(path, "r"):
            if line == '\n':
                c += 1
        
        self.len = (c + 1) // 2
        self.read_count = 0
        self.pgn_file = open(path, "r")

        if skip >= self.len:
            raise Exception("Not enough games to skip")
        
        for _ in range(skip):
            while self.pgn_file.readline() != '\n': pass
            while self.pgn_file.readline() != '\n': pass
            self.len -= 1
    
    def __iter__(self):
        return self
    
    def __len__(self):
        return self.len
    
    def __next__(self):
        pgn_string = ""
        c = False

        while True:
            line = self.pgn_file.readline()
            
            if line == '':
                raise StopIteration
            
            if (c and line == '\n') or line == '':
                break
            elif line == '\n':
                c = True
            pgn_string += line   
        self.read_count += 1
        return pgn_string
    
    def __str__(self):
        return f"Read positions : {self.read_count} Limit : {self.limit} PGN file : {self.pgn_file}"

In [8]:
class DataGenerator:

    def __init__(self, dataset_tool : DatasetTool, engine_path : str, pos_limit : int, skip : int = 0, avoid_repetitions : bool = True, e : bool = False, extract_side=False):
        self.pgn_file = dataset_tool.dataset_file_name()
        self.pos_limit = pos_limit
        self.skip = skip
        self.avoid_repetitions = avoid_repetitions
        self.pgn_provider = PGNProvider(self.pgn_file, skip=skip)
        self.signatures = set()
        self.pos_total = 0
        self.pos_written = 0
        self.games = 0
        self.engine = chess.engine.SimpleEngine.popen_uci(engine_path)
        self.engine.configure({"Threads" : 12, "Hash" : 1024})
        self.dataset_tool = dataset_tool
        self.eval = e
        self.extract_side = extract_side

    def write(self):
        out_file = self.dataset_tool.create_bitboard_path(self.pos_limit, self.skip, depth_min=0, depth_max=0)
        print("Writing header...")
        self.write_header(out_file)
        print("Done")
        print("Writing boards...")
        with tqdm(total=self.pos_limit, position=0, leave=True, desc="Positions") as pbar:
            for game_str in self.pgn_provider:
                game = chess.pgn.read_game(io.StringIO(game_str))
                if self.extract_side:
                    pos_count = self.write_positions_from_game_and_side(out_file, game)
                else:
                    pos_count = self.write_positions_from_game(out_file, game)
                self.games += 1 if pos_count > 0 else 0
                pbar.update(pos_count)
                if self.pos_written >= self.pos_limit: break
        print("Done")
        print(f"Read {self.pos_total} positions total")
        print(f"Written {self.pos_written} positions")
        print(f"Games : {self.games}")

    def write_positions_from_game(self, out_file, game : chess.pgn.Game) -> int:
        pos_count = 0
        while game is not None:
            self.pos_total += 1
            pos_count += self.write_pos(out_file, game)
            if self.pos_written >= self.pos_limit: break
            game = game.next()
        return pos_count
        
    def write_positions_from_game_and_side(self, out_file, game : chess.pgn.Game) -> int:
        pos_count = 0
        ex = 1 if self.extract_side == game.headers.get('White') else 0 
        while game is not None:
            self.pos_total += 1
            if ex:
                pos_count += self.write_pos(out_file, game)
                if self.pos_written >= self.pos_limit: break
            game = game.next()
            ex ^= 1
        return pos_count

    
    def write_header(self, out_file : str) -> None:
        header = []
        for side in chess.COLORS:
            for piece in chess.PIECE_TYPES:
                curr_piece = chess.piece_symbol(piece)
                if side == chess.WHITE:
                    header.append(curr_piece.upper())
                else:
                    header.append(curr_piece)
        header += ["score", "draw", "fen"]
        with open(out_file, "w") as f:
            f.write(",".join(header) + "\n")

    def write_pos(self, out_file : str, game : chess.pgn.Game) -> int:
        content = []
        bitboards = self.pgn_to_bitboard(game)

        if self.avoid_repetitions:
            if bitboards in self.signatures:
                return 0
            self.signatures.add(bitboards)       
        
        for bitboard in bitboards:
            content.append(str(bitboard))
            
        fen = game.board().fen()
        if self.eval: 
            score = self.evaluate(game)
            draw = self.predicate(score)
            content += [str(score), str(draw), fen]
        else: 
            content += ['-1', '-1', fen]
            
        with open(out_file, "a") as f:
            f.write(",".join(content) + "\n")
        self.pos_written += 1
        return 1

    def evaluate(self, game):
        info = self.engine.analyse(game.board(), chess.engine.Limit(time=0))
        score = info['score']
        if score.is_mate():
            return 0
        return score.relative.score()

    def predicate(self, score):
        return 1 if -50 <= score <= 50 else 0
        
    
    def pgn_to_bitboard(self, game : chess.pgn.Game) -> tuple[int]:
        result = []
        for side in chess.COLORS:
            for piece in chess.PIECE_TYPES:
                result.append(int(game.board().pieces(piece, side)))
        return tuple(result)

data_generator = DataGenerator(elite_dataset_tool, STOCKFISH_AVX512_EXE, pos_limit=2000000)

In [8]:
data_generator.write()

Writing header...
Done
Writing boards...


Positions:   0%|          | 0/2000000 [00:00<?, ?it/s]

Done
Read 2357174 positions total
Written 2000000 positions
Games : 28727


In [76]:
games_dataset_tool = DatasetTool("games_dataset")

In [85]:
games_dataset_generator = DataGenerator(games_dataset_tool, STOCKFISH_AVX512_EXE, pos_limit=10, extract_side='morphebot_nn', avoid_repetitions=False)

In [86]:
games_dataset_generator.write()

Writing header...
Done
Writing boards...


Positions:   0%|          | 0/10 [00:00<?, ?it/s]

Done
Read 20 positions total
Written 10 positions
Games : 1


In [20]:
games_pgn = PGNProvider(games_dataset_tool.dataset_file_name())

In [53]:
# def extract_pvs(evals):
#     for pvs in evals:
#         if 19 <= pvs['depth'] <= 24:
#             return pvs['pvs'], pvs['depth']
#     return None, None

# reader = pd.read_json("data/eval_dataset/lichess_db_eval.jsonl", lines=True, chunksize=100000)
# stat = [0 for i in range(1000)]
# total = 0
# with tqdm(total=14000000, position=0, leave=True, desc="Positions") as pbar:
#     for chunk in reader:
#         ch_size = len(chunk)
#         for i in range(ch_size):
#             total += 1
#             game_info = chunk.iloc[i]
#             evals = game_info['evals']
#             pvs, depth = extract_pvs(evals)
#             if pvs == None: continue
        
#             stat[len(pvs)] += 1
                
#         pbar.update(ch_size)
# print(total)

In [51]:
class DataGeneratorFEN:

    def __init__(self, dataset_tool : DatasetTool, pos_limit : int, skip : int = 0, depth_min : int = 19, depth_max : int = 24, chunksize : int = 10000):
        self.fen_file = dataset_tool.dataset_file_name()
        self.pos_limit = pos_limit
        self.skip = skip
        self.pos_total = 0
        self.pos_written = 0
        self.reader = pd.read_json(self.fen_file, lines=True, chunksize=chunksize)
        self.header = ["white", "cK", "cQ", "ck", "cq", "draw", "score", "mate", "depth", "fen"]
        self.skipn = ceil(skip / chunksize)
        self.dataset_tool = dataset_tool
        self.depth_min = depth_min
        self.depth_max = depth_max
        self.logger = logging.getLogger('DataGeneratorFEN')

        for i in range(self.skipn):
            next(self.reader)

    def extract_pvs(self, evals):
        for pvs in evals:
            if self.depth_min <= pvs['depth'] <= self.depth_max:
                return pvs['pvs'], pvs['depth']
        return None, None


    def write(self):
        out_file = self.dataset_tool.create_bitboard_path(self.pos_limit, self.skip, self.depth_min, self.depth_max)
        print("Writing header...")
        self.write_header(out_file)
        print("Done")
        print("Writing boards...")
        with tqdm(total=self.pos_limit, position=0, leave=True, desc="Positions") as pbar:
            for chunk in self.reader:
                written_per_chunk = 0
                for i in range(len(chunk)):
                    self.pos_total += 1
                    game_info = chunk.iloc[i]
                    evals = game_info['evals']
                    pvs, depth = self.extract_pvs(evals)
                    if pvs == None: continue

                    fen = game_info['fen']
                    try:
                        board = chess.Board(fen)
                    except Exception as e:
                        self.logger.warning("When parsing fen: %s with error: %s", fen, repr(e))
                        continue
                        
                    mate = 0
                    if "cp" in pvs[0]:
                        score = pvs[0]['cp']
                    else:
                        score = pvs[0]['mate']
                        mate = 1
                    self.write_pos(out_file, board, score, mate, depth)
                    self.pos_written += 1
                    written_per_chunk += 1
                    if self.pos_written >= self.pos_limit: break
                pbar.update(written_per_chunk)
                if self.pos_written >= self.pos_limit: break
                
        print("Done")
        print(f"Read {self.pos_total} positions total")
        print(f"Written {self.pos_written} positions")

    def write_header(self, out_file : str) -> None:
        header = []
        for side in chess.COLORS:
            for piece in chess.PIECE_TYPES:
                curr_piece = chess.piece_symbol(piece)
                if side == chess.WHITE:
                    header.append(curr_piece.upper())
                else:
                    header.append(curr_piece)
        header += self.header
        with open(out_file, "w") as f:
            f.write(";".join(header) + "\n")

    def write_pos(self, out_file : str, board : chess.Board, score : int, mate : int, depth : int) -> int:
        content = []
        bitboards = self.board_to_bitboard(board)
        fen = board.fen()
        draw = str(1) if -50 <= score <= 50 else str(0)
        score = str(score)
        mate = str(mate)
        turn = str(int(board.turn))
        depth = str(depth)
        castling_K = str(int(board.has_kingside_castling_rights(chess.WHITE)))
        castling_Q = str(int(board.has_queenside_castling_rights(chess.WHITE)))
        castling_k = str(int(board.has_kingside_castling_rights(chess.BLACK)))
        castling_q = str(int(board.has_queenside_castling_rights(chess.BLACK)))
        
        for bitboard in bitboards:
            content.append(str(bitboard))

        content += [turn, castling_K, castling_Q, castling_k, castling_q, draw, score, mate, depth, fen]
        with open(out_file, "a") as f:
            f.write(";".join(content) + "\n")
        return 1
    
    def board_to_bitboard(self, board : chess.Board) -> tuple[int]:
        result = []
        for side in chess.COLORS:
            for piece in chess.PIECE_TYPES:
                result.append(int(board.pieces(piece, side)))
        return tuple(result)

In [52]:
d = DataGeneratorFEN(eval_dataset_tool, 6882157, skip=0, chunksize=100000)
d.write()

Writing header...
Done
Writing boards...


Positions:   0%|          | 0/6882157 [00:00<?, ?it/s]

Done
Read 13869526 positions total
Written 6882157 positions


In [20]:
d = DataGeneratorFEN(eval_dataset_tool, 14000000, skip=0, chunksize=100000)
d.write()

Writing header...
Done
Writing boards...


Positions:   0%|          | 0/14000000 [00:00<?, ?it/s]

  self.logger.warn("When parsing fen: %s with error: %s", fen, repr(e))
  self.logger.warn("When parsing fen: %s with error: %s", fen, repr(e))


Done
Read 13123860 positions total
Written 6882157 positions


In [54]:
d = DataGeneratorFEN(eval_dataset_tool, 1000000, skip=0, chunksize=100000)
d.write()

Writing header...
Done
Writing boards...


Positions:   0%|          | 0/1000000 [00:00<?, ?it/s]

Done
Read 1965254 positions total
Written 1000000 positions


In [55]:
d = DataGeneratorFEN(eval_dataset_tool, 100000, skip=1509579)
d.write()

Writing header...
Done
Writing boards...


Positions:   0%|          | 0/100000 [00:00<?, ?it/s]

Done
Read 190105 positions total
Written 100000 positions


In [56]:
d = DataGeneratorFEN(eval_dataset_tool, 100000, skip=1509579+136485)
d.write()

Writing header...
Done
Writing boards...


Positions:   0%|          | 0/100000 [00:00<?, ?it/s]

Done
Read 207150 positions total
Written 100000 positions


In [57]:
d = DataGeneratorFEN(eval_dataset_tool, 250000, skip=1509579+136485+161371)
d.write()

Writing header...
Done
Writing boards...


Positions:   0%|          | 0/250000 [00:00<?, ?it/s]

Done
Read 514419 positions total
Written 250000 positions


In [58]:
d = DataGeneratorFEN(eval_dataset_tool, 250000, skip=1509579+136485+161371+371302)
d.write()

Writing header...
Done
Writing boards...


Positions:   0%|          | 0/250000 [00:00<?, ?it/s]

Done
Read 485300 positions total
Written 250000 positions


In [89]:
engine = chess.engine.SimpleEngine.popen_uci(STOCKFISH_AVX512_EXE)
engine.configure({"Threads" : 16, "Hash" : 16384})
def analyze_position(fen, depth=20):
    board = chess.Board(fen)
    info = engine.analyse(board, chess.engine.Limit(time=0.1))
    score = info['score']
    print(score.pov(chess.WHITE))
    return info

In [112]:
analyze_position("rnbqkbnr/pppp1ppp/8/4p3/4P3/8/PPPP1PPP/RNBQKBNR w KQkq - 0 2", depth=16)

+27


{'string': 'NNUE evaluation using nn-b1a57edbea57.nnue',
 'depth': 20,
 'seldepth': 40,
 'multipv': 1,
 'score': PovScore(Cp(+27), WHITE),
 'nodes': 731915,
 'nps': 7246683,
 'hashfull': 0,
 'tbhits': 0,
 'time': 0.101,
 'pv': [Move.from_uci('g1f3'),
  Move.from_uci('b8c6'),
  Move.from_uci('f1b5'),
  Move.from_uci('g8f6'),
  Move.from_uci('e1g1'),
  Move.from_uci('f6e4'),
  Move.from_uci('f1e1'),
  Move.from_uci('e4d6'),
  Move.from_uci('f3e5'),
  Move.from_uci('f8e7'),
  Move.from_uci('b5f1'),
  Move.from_uci('c6e5'),
  Move.from_uci('e1e5'),
  Move.from_uci('e8g8'),
  Move.from_uci('d2d4'),
  Move.from_uci('e7f6'),
  Move.from_uci('e5e1'),
  Move.from_uci('f8e8'),
  Move.from_uci('c1f4'),
  Move.from_uci('e8e1'),
  Move.from_uci('d1e1'),
  Move.from_uci('d6e8'),
  Move.from_uci('e1e3'),
  Move.from_uci('d7d5'),
  Move.from_uci('b1d2'),
  Move.from_uci('c8f5'),
  Move.from_uci('c2c3'),
  Move.from_uci('a7a5'),
  Move.from_uci('d2b3'),
  Move.from_uci('c7c6'),
  Move.from_uci('a1e1')]

In [112]:
def count_eval_depths():
    depths = dd(lambda: 0)
    def count(datapoint):
        for i in datapoint:
            depths[i['depth']] += 1
    
    reader = pd.read_json(eval_dataset_tool.dataset_file_name(), lines=True, chunksize=100000)
    datapoints = 0

    with tqdm(total=13123859, position=0, leave=True, desc="Positions") as pbar:
        for chunk in reader:
            chunk_len = len(chunk)
            datapoints += chunk_len
            chunk['evals'].apply(count)
            pbar.update(chunk_len)
    return depths


In [18]:
dep = count_eval_depths()

Positions:   0%|          | 0/13123859 [00:00<?, ?it/s]

In [116]:
sorted(dep.items(), key=lambda x : x[1], reverse=True)

[(22, 1539075),
 (20, 1519346),
 (21, 1452153),
 (23, 1378441),
 (24, 1195223),
 (19, 1028930),
 (25, 985546),
 (18, 917756),
 (26, 774430),
 (17, 712042),
 (27, 610112),
 (99, 562977),
 (28, 492368),
 (16, 453104),
 (29, 383310),
 (30, 314644),
 (31, 257861),
 (15, 234675),
 (32, 218003),
 (33, 187108),
 (34, 163306),
 (35, 146212),
 (36, 133017),
 (37, 121927),
 (1, 115115),
 (38, 113162),
 (39, 104768),
 (40, 100415),
 (14, 96592),
 (41, 92100),
 (42, 85671),
 (43, 79643),
 (44, 73255),
 (45, 68478),
 (46, 63503),
 (47, 58382),
 (48, 53843),
 (49, 50059),
 (50, 48028),
 (245, 44738),
 (51, 43638),
 (52, 40514),
 (53, 37667),
 (13, 35400),
 (54, 34748),
 (55, 32506),
 (56, 30137),
 (57, 28002),
 (58, 25809),
 (59, 24287),
 (60, 22815),
 (61, 21107),
 (62, 19488),
 (63, 18268),
 (64, 16797),
 (65, 15804),
 (66, 14848),
 (67, 13805),
 (68, 13039),
 (69, 11793),
 (70, 11396),
 (71, 10377),
 (12, 10299),
 (72, 9729),
 (73, 9070),
 (74, 8668),
 (75, 8031),
 (76, 7522),
 (77, 7181),
 (78, 

In [19]:
bigdata = pd.read_csv("data/eval_dataset/bitboards/6882157_0_19_24.csv", sep=";")

In [21]:
bigdata['draw'].mean()

0.38863062844977236