<a href="https://colab.research.google.com/github/mlabonne/chessllm/blob/main/Chess_LLM_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# @title # ♟️ Chess LLM Dataset

!pip install -qqq zstandard datasets huggingface_hub tqdm --progress-bar off
!mkdir dataset

from tqdm.auto import tqdm
from datasets import Dataset
from huggingface_hub import create_repo, HfApi
from google.colab import userdata
import requests
import zstandard as zstd
import pandas as pd
import datasets
import re

def download_file(url, filename):
    response = requests.get(url, stream=True)
    if response.status_code == 200:
        with open(filename, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
        return True
    return False

def decompress_file(compressed_file, decompressed_file):
    with open(compressed_file, 'rb') as comp_file:
        dctx = zstd.ZstdDecompressor()
        with open(decompressed_file, 'wb') as decomp_file:
            dctx.copy_stream(comp_file, decomp_file)

def parse_and_store_games(file_path):
    total_lines = sum(1 for _ in open(file_path, 'r'))
    games_data = []
    current_game = []
    elo_regex = re.compile(r"\[WhiteElo \"(\d+)\"\]|\[BlackElo \"(\d+)\"\]")

    with open(file_path, 'r') as file:
        progress_bar = tqdm(total=total_lines, desc="Parsing Games")
        for line in file:
            stripped_line = line.strip()
            current_game.append(stripped_line)
            if stripped_line == "":
                game_content = "\n".join(current_game)
                elos = elo_regex.findall(game_content)

                white_elo, black_elo = 0, 0
                if elos:
                    for elo_pair in elos:
                        white_elo = int(elo_pair[0]) if elo_pair[0] else white_elo
                        black_elo = int(elo_pair[1]) if elo_pair[1] else black_elo

                if white_elo and black_elo:
                    avg_elo = (white_elo + black_elo) / 2
                    games_data.append((avg_elo, game_content))

                current_game = []

            progress_bar.update(1)
        progress_bar.close()

    return pd.DataFrame(games_data, columns=['AverageElo', 'Transcript'])

def sample_games(df, min_elo, max_elo, max_games):
    df = df[(df['AverageElo'] >= min_elo) & (df['AverageElo'] <= max_elo)]
    df = df.sort_values(by='AverageElo')

    increment = (max_elo - min_elo) / max_games
    sampled_games = []

    for i in tqdm(range(max_games), desc="Sampling Games"):
        target_elo = min_elo + i * increment
        closest_game = df.iloc[(df['AverageElo'] - target_elo).abs().argsort()[:1]]
        sampled_games.append(closest_game)

    return pd.concat(sampled_games)

def save_games(games, filename):
    with open(filename, 'w') as file:
        for game in games:
            file.write(game + "\n\n")

url = "https://database.lichess.org/standard/lichess_db_standard_rated_2013-04.pgn.zst" # @param {type:"string"}
MIN_ELO = 1000 # @param {type:"integer"}
MAX_ELO = 3000 # @param {type:"integer"}
MAX_GAMES = 10000 # @param {type:"integer"}
username = "mlabonne" # @param {type:"string"}

compressed_file = "lichess_db_compressed.zst"
decompressed_file = "lichess_db.pgn"
output_file = f"dataset/lichess_{MAX_GAMES//1000}k_{MIN_ELO}-{MAX_ELO}.parquet"

print("Downloading file...")
download_file(url, compressed_file)

print("Decompressing file...")
decompress_file(compressed_file, decompressed_file)

print("Parsing games and creating DataFrame...")
games_df = parse_and_store_games(decompressed_file)

print("Sampling games...")
sampled_df = sample_games(games_df, MIN_ELO, MAX_ELO, MAX_GAMES)

print("Save as HF Dataset...")
dataset = Dataset.from_pandas(sampled_df)
dataset = dataset.remove_columns(["__index_level_0__"])
dataset.to_parquet(output_file)

# Defined in the secrets tab in Google Colab
api = HfApi()

# Create empty repo
create_repo(
    repo_id = f"{username}/chessllm",
    repo_type="dataset",
    exist_ok=True,
)

# Upload gguf files
api.upload_folder(
    repo_id=f"{username}/chessllm",
    repo_type="dataset",
    folder_path="dataset",
)

sampled_df

mkdir: cannot create directory ‘dataset’: File exists
Downloading file...
Decompressing file...
Parsing games and creating DataFrame...


Parsing Games:   0%|          | 0/2841051 [00:00<?, ?it/s]

Sampling games...


Sampling Games:   0%|          | 0/10000 [00:00<?, ?it/s]

Save as HF Dataset...


Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

lichess_10k_1000-3000.parquet:   0%|          | 0.00/556k [00:00<?, ?B/s]

Unnamed: 0,AverageElo,Transcript
86945,1001.0,"[Event ""Rated Bullet game""]\n[Site ""https://li..."
86945,1001.0,"[Event ""Rated Bullet game""]\n[Site ""https://li..."
86945,1001.0,"[Event ""Rated Bullet game""]\n[Site ""https://li..."
86945,1001.0,"[Event ""Rated Bullet game""]\n[Site ""https://li..."
86945,1001.0,"[Event ""Rated Bullet game""]\n[Site ""https://li..."
...,...,...
45763,2335.5,"[Event ""Rated Bullet game""]\n[Site ""https://li..."
45763,2335.5,"[Event ""Rated Bullet game""]\n[Site ""https://li..."
45763,2335.5,"[Event ""Rated Bullet game""]\n[Site ""https://li..."
45763,2335.5,"[Event ""Rated Bullet game""]\n[Site ""https://li..."
