<a href="https://colab.research.google.com/github/luizvalle/Chessformer/blob/main/chessformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [72]:
# Mount Google Drive at /content/drive so later cells can read and write
# files under that path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [73]:
# @title Install dependencies {display-mode: "form"}
%%shell
# Third-party packages used by the notebook's import cell.
pip install chess
pip install zstandard
pip install pandas
pip install numpy
pip install parallelbar





In [79]:
# @title Imports {display-mode: "form"}
import chess.pgn
import zstandard as zstd
import requests
import io
import pandas as pd
import numpy as np
import re
import multiprocessing
import os

from pprint import pprint
from parallelbar import progress_imapu

## Download and prepare the data

In [75]:
# @title Fetch the download links {display-mode: "form"}
DOWNLOAD_LIST = "https://database.lichess.org/standard/list.txt"


def fetch_download_links(list_url=DOWNLOAD_LIST):
  """Download the list of monthly PGN archives and return it sorted ascending.

  Args:
    list_url: URL of a plain-text file with one download link per line.

  Returns:
    A sorted list of non-empty, whitespace-stripped links.

  Raises:
    requests.HTTPError: if the server answers with an error status.
  """
  response = requests.get(list_url, timeout=30)
  # Fail loudly instead of treating an error page as a list of links.
  response.raise_for_status()
  # splitlines() + filtering drops the blank entry a trailing newline would
  # otherwise produce; blank entries cannot be parsed by later cells.
  stripped_lines = (line.strip() for line in response.text.splitlines())
  return sorted(link for link in stripped_lines if link)


download_links = fetch_download_links()
print("Examples:")
pprint(download_links[0:5])

Examples:
['https://database.lichess.org/standard/lichess_db_standard_rated_2013-01.pgn.zst',
 'https://database.lichess.org/standard/lichess_db_standard_rated_2013-02.pgn.zst',
 'https://database.lichess.org/standard/lichess_db_standard_rated_2013-03.pgn.zst',
 'https://database.lichess.org/standard/lichess_db_standard_rated_2013-04.pgn.zst',
 'https://database.lichess.org/standard/lichess_db_standard_rated_2013-05.pgn.zst']


In [77]:
# @title Define the class to iterate through headers {display-mode: "form"}
class CompressedPgnHeaderIterator:
  """Iterator over the game headers of a zstd-compressed PGN file served over HTTP.

  The file is streamed and decompressed incrementally, so it is never held
  in memory as a whole. Each call to ``next()`` returns the value produced
  by ``chess.pgn.read_headers`` for the next game in the stream.
  """

  def __init__(self, download_link):
    """Open a streaming, decompressing text reader on ``download_link``.

    Args:
      download_link: URL of a ``.pgn.zst`` file.

    Raises:
      requests.HTTPError: if the server answers with an error status.
    """
    # Stream the results so we do not load everything
    # into memory at once
    response = requests.get(url=download_link, stream=True)
    # Without this check an HTTP error body would be handed to the zstd
    # decoder and fail later with an unrelated decompression error.
    response.raise_for_status()
    dctx = zstd.ZstdDecompressor()
    reader = dctx.stream_reader(response.raw)
    self.text_stream = io.TextIOWrapper(reader, encoding='utf-8')

  def __iter__(self):
    """Return the iterator itself."""
    return self

  def __next__(self):
    """Return the headers of the next game.

    Raises:
      StopIteration: when the parser reports the end of the stream.
    """
    header = chess.pgn.read_headers(self.text_stream)
    # End of stream is signalled by None. A truthiness test would also stop
    # on a headers mapping that happens to be empty.
    if header is None:
      raise StopIteration
    return header

In [None]:
# @title Prepare chess games metadata for exploration {display-mode: "form"}
# Folder on the mounted Drive where the per-month parquet files are written.
SAVE_DIR = "/content/drive/MyDrive/Trabalhos Escolares/Data Mining/Dataset/Data exploration"

# Target dtype of every column kept from the PGN headers.
column_types = dict(
    Event="category",
    Result="category",
    WhiteElo="uint16",
    BlackElo="uint16",
    TimeControl="category",
    Termination="category",
)

def path_from_link(download_link):
  """Return the parquet path under SAVE_DIR for a monthly archive link.

  Args:
    download_link: URL containing ``YYYY-MM.pgn.zst``.

  Returns:
    ``f"{SAVE_DIR}/YYYY-MM.parquet.zstd"`` for the month found in the link.

  Raises:
    ValueError: if the link does not contain a ``YYYY-MM.pgn.zst`` suffix.
  """
  # Dots are escaped so only a literal ".pgn.zst" is accepted.
  match = re.search(r"(\d{4}-\d{2})\.pgn\.zst", download_link)
  if match is None:
    raise ValueError(
        f"Cannot extract a YYYY-MM date from download link: {download_link!r}")
  date = match.group(1)
  return f"{SAVE_DIR}/{date}.parquet.zstd"

def _parse_elo(raw_elo):
  """Convert a PGN Elo tag value to an int, falling back to 0 when it is not a number."""
  # isdecimal() only accepts characters that int() can parse, whereas
  # isnumeric() also accepts e.g. fractions and superscripts.
  return int(raw_elo) if raw_elo.isdecimal() else 0


def process_headers(download_link, overwrite=False):
  """Extract selected header fields of every game in an archive to parquet.

  Streams the archive at ``download_link``, keeps the columns listed in
  ``column_types`` and writes them, zstd-compressed, to the path given by
  ``path_from_link``.

  Args:
    download_link: URL of the monthly ``.pgn.zst`` archive.
    overwrite: when False (default) an already existing output file is kept
      and nothing is downloaded.

  Returns:
    None.
  """
  path = path_from_link(download_link)
  if not overwrite and os.path.isfile(path):
    return
  games_info = [{
        "Event": header["Event"],
        "Result": header["Result"],
        "WhiteElo": _parse_elo(header["WhiteElo"]),
        "BlackElo": _parse_elo(header["BlackElo"]),
        "TimeControl": header["TimeControl"],
        "Termination": header["Termination"]}
        for header in CompressedPgnHeaderIterator(download_link)]
  # Naming the columns keeps astype() working when the archive has no games.
  df = pd.DataFrame(data=games_info, columns=list(column_types)).astype(column_types)
  parent_dir = os.path.dirname(path)
  if parent_dir:
    os.makedirs(parent_dir, exist_ok=True)
  # Write to the path computed above; the previous f-string referenced an
  # undefined name and failed only after the whole archive had been parsed.
  df.to_parquet(path=path, compression="zstd")
  print(f"Saved '{path}'")

# Queue only the months whose parquet file has not been written yet.
unprocessed_links = []
for download_link in download_links:
  path = path_from_link(download_link)
  if os.path.isfile(path):
    continue
  unprocessed_links.append(download_link)

# Process the queued months in parallel with a progress bar.
progress_imapu(func=process_headers, tasks=unprocessed_links, chunk_size=10)

DONE:   0%|          | 0/81 [00:00<?, ?it/s]

In [76]:
# @title Define the class to iterate through games {display-mode: "form"}
class CompressedPgnGameIterator:
  """Iterator over the games of a zstd-compressed PGN file served over HTTP.

  The file is streamed and decompressed incrementally, so it is never held
  in memory as a whole. Each call to ``next()`` returns the value produced
  by ``chess.pgn.read_game`` for the next game in the stream.
  """

  def __init__(self, download_link):
    """Open a streaming, decompressing text reader on ``download_link``.

    Args:
      download_link: URL of a ``.pgn.zst`` file.

    Raises:
      requests.HTTPError: if the server answers with an error status.
    """
    # Stream the results so we do not load everything
    # into memory at once
    response = requests.get(url=download_link, stream=True)
    # Without this check an HTTP error body would be handed to the zstd
    # decoder and fail later with an unrelated decompression error.
    response.raise_for_status()
    dctx = zstd.ZstdDecompressor()
    reader = dctx.stream_reader(response.raw)
    self.text_stream = io.TextIOWrapper(reader, encoding='utf-8')

  def __iter__(self):
    """Return the iterator itself."""
    return self

  def __next__(self):
    """Return the next parsed game.

    Raises:
      StopIteration: when the parser reports the end of the stream.
    """
    game = chess.pgn.read_game(self.text_stream)
    # End of stream is signalled by None.
    if game is None:
      raise StopIteration
    return game