In [1]:
import zstandard as zstd
import io
import chess.pgn
import pandas as pd
from tqdm import tqdm

In [2]:
def stream_games_from_zst(file_path, max_games=None):
    """Generator that yields chess.pgn.Game objects from a .pgn.zst file"""
    with open(file_path, 'rb') as fh:
        dctx = zstd.ZstdDecompressor(max_window_size=2147483648)
        with dctx.stream_reader(fh) as reader:
            text_stream = io.TextIOWrapper(reader, encoding='utf-8')
            game_counter = 0
            while True:
                game = chess.pgn.read_game(text_stream)
                if game is None:
                    break
                game_counter += 1
                yield game
                if max_games and game_counter >= max_games:
                    break
def parse_zst_to_csv(input_zst, output_csv, min_moves=5, max_games=None):
    """Parse PGN games from .zst and save structured data to CSV"""
    games_data = []

    for i, game in enumerate(tqdm(stream_games_from_zst(input_zst, max_games))):
        headers = game.headers
        event = headers.get("Event", "")
        site = headers.get("Site", "")
        white = headers.get("White", "")
        black = headers.get("Black", "")
        result = headers.get("Result", "")
        white_elo = headers.get("WhiteElo", "")
        black_elo = headers.get("BlackElo", "")
        eco = headers.get("ECO", "")
        opening = headers.get("Opening", "")
        utc_date = headers.get("UTCDate", "")
        utc_time = headers.get("UTCTime", "")
        termination = headers.get("Termination", "")

        # Extract move list
        moves = []
        node = game
        while node.variations:
            next_node = node.variation(0)
            moves.append(next_node.san())
            node = next_node

        if len(moves) < min_moves:
            continue  # Skip short games

        games_data.append({
            "Event": event,
            "Site": site,
            "White": white,
            "Black": black,
            "Result": result,
            "WhiteElo": white_elo,
            "BlackElo": black_elo,
            "ECO": eco,
            "Opening": opening,
            "NumMoves": len(moves),
            "Moves": " ".join(moves),
            "UTCDate": utc_date,
            "UTCTime": utc_time,
            "Termination": termination
        })

    df = pd.DataFrame(games_data)
    df.to_csv(output_csv, index=False)
    print(f"✅ Saved {len(df)} games to {output_csv}")

In [4]:
# Example usage:
# Adjust max_games if you just want to test first.
parse_zst_to_csv("Dataset\lichess_db_standard_rated_2015-12.pgn.zst", "parsed_games.csv", min_moves=5, max_games=200000)


  parse_zst_to_csv("Dataset\lichess_db_standard_rated_2015-12.pgn.zst", "parsed_games.csv", min_moves=5, max_games=200000)
200000it [1:27:54, 37.92it/s] 


✅ Saved 197665 games to parsed_games.csv
