<a href="https://colab.research.google.com/github/luizvalle/Chessformer/blob/main/chessformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive into the Colab filesystem so later cells can write
# their results underneath it.
from google.colab import drive as gdrive

DRIVE_MOUNT_POINT = "/content/drive"
gdrive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [3]:
# @title Install dependencies {display-mode: "form"}
%%shell
# -q keeps pip's progress bars (raw ANSI escape sequences) out of the saved
# notebook output. The last recorded run of this cell resolved chess 1.10.0 and
# zstandard 0.21.0; nothing is pinned here because the recorded wheels were
# specific to that runtime's Python version.
pip install -q chess zstandard
pip install -q pandas numpy dask

Collecting chess
  Downloading chess-1.10.0-py3-none-any.whl (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━[0m [32m92.2/154.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: chess
Successfully installed chess-1.10.0
Collecting zstandard
  Downloading zstandard-0.21.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m36.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: zstandard
Successfully installed zstandard-0.21.0




In [22]:
# @title Imports {display-mode: "form"}
import chess.pgn
import zstandard as zstd
import requests
import io
import pandas as pd
import numpy as np
import dask.dataframe as dd
import re

from pprint import pprint
from multiprocessing import Pool

## Download and prepare the data

In [17]:
# @title Fetch the download links {display-mode: "form"}
DOWNLOAD_LIST = "https://database.lichess.org/standard/list.txt"

# Fail loudly on an HTTP error instead of treating an error page as a list of
# links; the timeout stops the cell from hanging on a stalled connection.
list_response = requests.get(DOWNLOAD_LIST, timeout=60)
list_response.raise_for_status()

# splitlines() plus the emptiness filter drops blank entries (for example the
# one produced by a trailing newline), which would otherwise be handed to the
# download step as if they were links.
download_links = sorted(
    link.strip() for link in list_response.text.splitlines() if link.strip()
)
print("Examples:")
pprint(download_links[0:5])

Examples:
['https://database.lichess.org/standard/lichess_db_standard_rated_2013-01.pgn.zst',
 'https://database.lichess.org/standard/lichess_db_standard_rated_2013-02.pgn.zst',
 'https://database.lichess.org/standard/lichess_db_standard_rated_2013-03.pgn.zst',
 'https://database.lichess.org/standard/lichess_db_standard_rated_2013-04.pgn.zst',
 'https://database.lichess.org/standard/lichess_db_standard_rated_2013-05.pgn.zst']


In [6]:
# @title Define the class to iterate through games {display-mode: "form"}
class CompressedPgnIterator:
  """Iterate over the games of a zstd-compressed PGN file fetched over HTTP.

  The response body is streamed through a zstd decompressor and a UTF-8 text
  wrapper, so games are parsed one at a time instead of loading the whole
  file into memory.

  Args:
    download_link: URL of the ``.pgn.zst`` file to read.

  Raises:
    requests.HTTPError: if the server answers with an error status.
  """

  def __init__(self, download_link):
    self._closed = False
    # Stream the results so we do not load everything
    # into memory at once
    self._response = requests.get(url=download_link, stream=True)
    try:
      # Without this check an HTTP error page would be handed to the
      # decompressor and fail later with an unrelated decoding error.
      self._response.raise_for_status()
      dctx = zstd.ZstdDecompressor()
      reader = dctx.stream_reader(self._response.raw)
      self.text_stream = io.TextIOWrapper(reader, encoding='utf-8')
    except Exception:
      # Do not leave the connection open if setup fails half-way.
      self._response.close()
      raise

  def __iter__(self):
    return self

  def __next__(self):
    # Keep raising StopIteration after exhaustion rather than reading from
    # the already-closed stream.
    if self._closed:
      raise StopIteration
    game = chess.pgn.read_game(self.text_stream)
    # read_game signals the end of the stream with None; compare against
    # None explicitly so the check does not depend on a game's truthiness.
    if game is None:
      self.close()
      raise StopIteration
    return game

  def close(self):
    """Release the text stream and the underlying HTTP connection.

    Called automatically when the last game has been read; safe to call more
    than once.
    """
    if self._closed:
      return
    self._closed = True
    try:
      self.text_stream.close()
    finally:
      self._response.close()

In [None]:
# Target dtypes for the per-game summary table.
column_types = {
    "Event": "category",
    "Result": "category",
    # Nullable unsigned integers: games without a usable rating are stored as
    # <NA>. A plain "uint16" cannot represent a missing value and makes
    # astype() raise as soon as one such game is present.
    "WhiteElo": "UInt16",
    "BlackElo": "UInt16",
    "TimeControl": "category",
    "Termination": "category"
}

# Directory (on the mounted Drive) that receives one file per month.
OUTPUT_DIR = "/content/drive/MyDrive/Trabalhos Escolares/Data Mining/Dataset/Data exploration"


def _parse_elo(raw_elo):
  """Return the rating as an int, or pd.NA when it is absent or not a decimal number."""
  # isdecimal() only accepts characters that int() can parse, unlike
  # isnumeric(), which also accepts e.g. superscripts and fractions.
  if raw_elo is not None and raw_elo.isdecimal():
    return int(raw_elo)
  return pd.NA


def process_link(download_link, output_dir=OUTPUT_DIR):
  """Summarise every game of one monthly dump and save the table as Parquet.

  For each game the Event, Result, WhiteElo, BlackElo, TimeControl and
  Termination headers are collected, converted to the dtypes in
  ``column_types`` and written to ``{output_dir}/{YYYY-MM}.zstd`` as a
  zstd-compressed Parquet file.

  Note: all records of the month are held in memory before being written.

  Args:
    download_link: URL of a ``...YYYY-MM.pgn.zst`` file.
    output_dir: directory that receives the output file.

  Raises:
    ValueError: if ``download_link`` does not contain a ``YYYY-MM.pgn.zst``
      suffix to take the output file name from.
  """
  # Dots are escaped so they only match the literal extension separators.
  date_match = re.search(r"(\d{4}-\d{2})\.pgn\.zst", download_link)
  if date_match is None:
    raise ValueError(
        f"Cannot find a YYYY-MM.pgn.zst name in download link: {download_link!r}")
  date = date_match.group(1)

  games = CompressedPgnIterator(download_link)
  # .get() returns None for a header that is absent from a game, so a single
  # incomplete game no longer aborts the whole month with a KeyError.
  games_info = [{
        "Event": game.headers.get("Event"),
        "Result": game.headers.get("Result"),
        "WhiteElo": _parse_elo(game.headers.get("WhiteElo")),
        "BlackElo": _parse_elo(game.headers.get("BlackElo")),
        "TimeControl": game.headers.get("TimeControl"),
        "Termination": game.headers.get("Termination")}
        for game in games]
  # Passing the column names explicitly keeps astype() working when a file
  # contains no games at all (the frame would otherwise have no columns).
  df = pd.DataFrame(data=games_info, columns=list(column_types)).astype(column_types)
  df.to_parquet(
      path=f"{output_dir}/{date}.zstd",
      compression="zstd"
      )

# Process the monthly dumps in parallel, five at a time.
# chunksize=1 dispatches links one at a time. With the default, map() splits
# the (date-sorted) list into batches of consecutive links and assigns each
# batch as a unit, so one worker can be left with a whole run of neighbouring
# months while the others sit idle.
with Pool(processes=5) as pool:
  pool.map(process_link, download_links, chunksize=1)
print("Saved data")
