In [9]:
import chess.pgn

def _parse_elo(value):
    """Return an Elo header value as an int, or 0 if it is missing or non-numeric."""
    try:
        return int(value)
    except (TypeError, ValueError):
        # Absent header (None) or a placeholder such as "?" / "-".
        return 0


def filter_games(pgn_file, contains_eval=False, max_rating_diff=None, game_count=None):
    """Copy the games from ``pgn_file`` that pass the filters into ``filtered_games.pgn``.

    Games are read, checked and written one at a time, so memory use does not
    grow with the size of the input and reading stops as soon as the export
    limit is reached.

    Args:
        pgn_file: Path of the PGN file to read.
        contains_eval: When true, skip games for which ``contains_evaluation``
            returns False.
        max_rating_diff: When not None, skip games whose absolute
            ``WhiteElo``/``BlackElo`` difference exceeds this value. A missing
            or non-numeric rating counts as 0.
        game_count: When not None, stop once this many games have been
            exported.

    Prints the number of exported and skipped games; returns None.
    """
    games_exported = 0
    games_skipped = 0

    # PGN files are normally ASCII/UTF-8; do not rely on the platform default.
    with open(pgn_file, encoding="utf-8") as f_in, \
            open("filtered_games.pgn", "w", encoding="utf-8") as f_out:
        # Test the limit before reading so that game_count=0 exports nothing
        # and no game is parsed after the limit has been hit.
        while game_count is None or games_exported < game_count:
            game = chess.pgn.read_game(f_in)
            if game is None:  # end of input
                break

            # Check if the game contains evaluation data
            if contains_eval and not contains_evaluation(game):
                games_skipped += 1
                continue

            # Check if the rating difference is within the specified limit
            if max_rating_diff is not None:
                white_rating = _parse_elo(game.headers.get('WhiteElo'))
                black_rating = _parse_elo(game.headers.get('BlackElo'))
                if abs(white_rating - black_rating) > max_rating_diff:
                    games_skipped += 1
                    continue

            # str(game) has no trailing newline; PGN expects a blank line
            # between one game's movetext and the next game's headers.
            f_out.write(str(game) + "\n\n")
            games_exported += 1

    print(f"Games exported: {games_exported}")
    print(f"Games skipped: {games_skipped}")

def contains_evaluation(game):
    """Report whether any mainline node's comment contains a ``[%eval`` tag."""
    eval_marker = "[%eval"
    return any(eval_marker in move_node.comment for move_node in game.mainline())


# Example usage: export up to 5000 games that carry engine evaluations and
# whose players' ratings differ by at most 2000 points.
pgn_file_path = "lichess_db_standard_rated_2020-01.pgn"  # Input PGN; replace with the path to your own file
filter_games(pgn_file_path, contains_eval=True, max_rating_diff=2000, game_count=5000)


Games exported: 5000
Games skipped: 88073
