<a href="https://colab.research.google.com/github/mlabonne/chessllm/blob/main/Chess_LLM_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# @title # ♟️ Chess LLM Dataset

# !pip install -qqq zstandard datasets huggingface_hub tqdm --progress-bar off
!mkdir dataset

from datasets import Dataset


try:
    from google.colab import userdata
except ImportError:
    print("Not in colab")


from chessllm.datasets import *
from chessllm.game_samplers import *

url = "https://database.lichess.org/standard/lichess_db_standard_rated_2017-06.pgn.zst"  # @param {type:"string"}
SAMPLING = "curriculum"  # @param ["curriculum", "random"]
MIN_ELO = 1500  # @param {type:"integer"}
MAX_ELO = 3000  # @param {type:"integer"}
MAX_GAMES = 1000000  # @param {type:"integer"}
username = "mlabonne"  # @param {type:"string"}

compressed_file = "lichess_db_compressed.zst"
decompressed_file = "lichess_db.pgn"

# Download file
print("Downloading file...")
compressed_filename, downloaded = download_file(url)
if downloaded:
    print(f"Downloaded {compressed_filename}.")
else:
    print(f"{compressed_filename} already exists.")

# Decompress file
print("Decompressing file...")
decompressed_filename, decompressed = decompress_file(compressed_filename)
if decompressed:
    print(f"Decompressed into {decompressed_filename}.")
else:
    print(f"{decompressed_filename} already exists.")

# Parse games
print("Parsing games and creating DataFrame...")
games_df = parse_and_store_games(decompressed_file)

# Sample games
print("Sampling games...")
if SAMPLING == "random":
    sampled_df = random_sampling(games_df, MIN_ELO, MAX_ELO, MAX_GAMES)
if SAMPLING == "curriculum":
    sampled_df = curriculum_sampling(games_df, MIN_ELO, MAX_ELO, MAX_GAMES)

MAX_GAMES = len(sampled_df)
output_file = (
    f"dataset/lichess_{SAMPLING}_{format_games(MAX_GAMES)}_{MIN_ELO}-{MAX_ELO}.parquet"
)

# Save HF dataset
print("Save as HF Dataset...")
dataset = Dataset.from_pandas(sampled_df)
dataset = dataset.remove_columns(["__index_level_0__"])
dataset.to_parquet(output_file)

# Upload to HF
upload_to_hf(dataset, username, "chessllm")

sampled_df

mkdir: dataset: File exists
Not in colab
Downloading file...


KeyboardInterrupt: 