In [None]:
import io
import json

import chess
import chess.pgn
import zstandard as zstd
from stockfish import Stockfish

In [None]:


# General parameters
ZST_FILE = "lichess_db_standard_rated_2025-03.pgn.zst"
OUTPUT_JSON = "180+2_evaluadas_10x200.json"
STOCKFISH_PATH = "stockfish/stockfish-windows-x86-64-avx2.exe"
CHECKPOINT_TEMPLATE = "sample_checkpoint_200_{count}.json"

# Elo range covered by the sample, split into equal-width buckets.
MIN_ELO = 800
MAX_ELO = 2800
NUM_BUCKETS = 10
BUCKET_SIZE = (MAX_ELO - MIN_ELO) // NUM_BUCKETS  # 200 Elo points per bucket
SAMPLE_PER_BUCKET = 200  # number of analysed games to keep in each bucket

# Engine instance shared by the analysis code.
engine = Stockfish(
    path=STOCKFISH_PATH,
    depth=10,
    parameters={"Threads": 2, "Hash": 128},
)

# Per-bucket state used by the sampling loop:
#   buckets -> analysed game records stored for each bucket
#   counts  -> games that passed the filters and mapped to each bucket
#   filled  -> True once a bucket holds SAMPLE_PER_BUCKET records
#   seen    -> total number of games read from the PGN stream
buckets = [[] for _ in range(NUM_BUCKETS)]
counts = [0 for _ in range(NUM_BUCKETS)]
filled = [False for _ in range(NUM_BUCKETS)]
seen = 0


In [None]:
# Analysis function
def analyze_and_package(game, sf_engine=None):
    """Evaluate every mainline position of a game and pack the result as a record.

    Each mainline move is pushed onto a board, the resulting position is handed
    to the engine, and the evaluation is stored in pawns (centipawns / 100).
    Mate announcements are clamped to +100.0 / -100.0.

    Args:
        game: object exposing ``headers`` (a mapping with ``get``), ``board()``
            and ``mainline_moves()``; in this notebook it comes from
            ``chess.pgn.read_game``.
        sf_engine: optional engine exposing ``set_fen_position(fen)`` and
            ``get_evaluation()``. Defaults to the module-level ``engine``.

    Returns:
        ``[white_elo, black_elo, winner, moves, evals]`` where ``winner`` is
        "white", "black" or "draw", ``moves`` is the list of UCI move strings
        and ``evals`` holds one score per move.

    Raises:
        ValueError: if an Elo header is present but is not an integer string.
    """
    sf = engine if sf_engine is None else sf_engine

    w_elo = int(game.headers.get("WhiteElo", 0))
    b_elo = int(game.headers.get("BlackElo", 0))
    board = game.board()
    moves, evals = [], []
    for move in game.mainline_moves():
        uci = move.uci()
        board.push(move)
        sf.set_fen_position(board.fen())
        info = sf.get_evaluation()
        value = info["value"]
        # NOTE(review): the sign handling assumes the engine wrapper reports
        # scores from White's point of view -- confirm for the installed version.
        if info["type"] == "cp":
            score = value / 100.0
        elif value > 0:
            score = 100.0
        elif value < 0:
            score = -100.0
        else:
            # "mate 0": the side to move is already checkmated, so the sign
            # cannot be read from the value. Previously this always produced
            # -100.0, even when White had just delivered mate.
            # Assumes board.turn is truthy when White is to move.
            score = -100.0 if board.turn else 100.0
        moves.append(uci)
        evals.append(score)

    result = game.headers.get("Result", "1/2-1/2")
    if result == "1-0":
        winner = "white"
    elif result == "0-1":
        winner = "black"
    else:
        # Any other result string (including a missing header) counts as a draw.
        winner = "draw"

    print(f"Partida analizada: {game.headers.get('White')} vs {game.headers.get('Black')} - Ganador: {winner}")
    return [w_elo, b_elo, winner, moves, evals]

# Checkpoint helper
def save_checkpoint(dataset, count):
    """Write the records collected so far to a numbered checkpoint file.

    The file name is CHECKPOINT_TEMPLATE with ``count`` substituted in;
    ``dataset`` is serialised as JSON into a UTF-8 file, keeping non-ASCII
    characters unescaped. A confirmation line is printed afterwards.
    """
    filename = CHECKPOINT_TEMPLATE.format(count=count)
    with open(filename, mode='w', encoding='utf-8') as out:
        out.write(json.dumps(dataset, ensure_ascii=False))
    print(f"Checkpoint guardado: {filename} con {len(dataset)} partidas")

In [None]:

# Stream the compressed PGN dump and fill every Elo bucket with analysed games.
CHECKPOINT_EVERY = 200  # write a checkpoint each time this many records are stored

try:
    with open(ZST_FILE, 'rb') as fh:  # open the local file
        dctx = zstd.ZstdDecompressor()
        reader = dctx.stream_reader(fh)
        # Undecodable bytes are dropped instead of aborting the run.
        pgn_stream = io.TextIOWrapper(reader, encoding='utf-8', errors='ignore')

        while True:
            game = chess.pgn.read_game(pgn_stream)
            if game is None:
                print("Final del archivo PGN.")
                break

            seen += 1
            # Keep only games played with the 180+2 time control.
            if game.headers.get("TimeControl", "") != "180+2":
                continue

            # Skip games whose Elo headers are not integers.
            try:
                w_elo = int(game.headers.get("WhiteElo", 0))
                b_elo = int(game.headers.get("BlackElo", 0))
            except ValueError:
                continue

            # Both players must be inside the sampled Elo range.
            if not (MIN_ELO <= w_elo <= MAX_ELO and MIN_ELO <= b_elo <= MAX_ELO):
                continue

            # Skip games with fewer than 5 mainline moves.
            if len(list(game.mainline_moves())) < 5:
                continue

            # Bucket by average Elo; the clamp keeps an average of exactly
            # MAX_ELO inside the last bucket.
            avg_elo = (w_elo + b_elo) / 2
            idx = int((avg_elo - MIN_ELO) // BUCKET_SIZE)
            idx = max(0, min(NUM_BUCKETS - 1, idx))
            counts[idx] += 1

            if not filled[idx]:
                entry = analyze_and_package(game)
                buckets[idx].append(entry)
                total_saved = sum(len(bucket) for bucket in buckets)
                ln = len(buckets[idx])
                if ln in (1, 100, 500, SAMPLE_PER_BUCKET):
                    print(f"[Bucket {idx+1}] {ln}/{SAMPLE_PER_BUCKET}")
                if total_saved % CHECKPOINT_EVERY == 0:
                    current_dataset = []
                    for b in buckets:
                        current_dataset.extend(b)
                    save_checkpoint(current_dataset, total_saved)
                if ln >= SAMPLE_PER_BUCKET:
                    filled[idx] = True
                    print(f"[Bucket {idx+1}] ¡LLENO!")

            if all(filled):
                print(f"Completados tras ver {seen} partidas.")
                break

except FileNotFoundError:
    print(f"Error: No se encuentra el archivo {ZST_FILE}")
except KeyboardInterrupt:
    # KeyboardInterrupt is not an Exception subclass, so without this clause a
    # manual interrupt would propagate and skip everything after this block.
    # Catching it lets the code that follows still run on the partial buckets.
    print(f"Interrumpido por el usuario tras ver {seen} partidas; se conservan los resultados parciales.")
except Exception as e:
    # Best effort: report the problem and keep whatever was collected so far.
    print(f"Error procesando el archivo: {e}")
    
    

# Report per-bucket totals, then merge all buckets and write the final JSON.
for i, bucket in enumerate(buckets):
    print(f"Bucket {i+1}: {len(bucket)} partidas, vistas {counts[i]}")

# Flatten in bucket order so the output keeps lower-Elo buckets first.
dataset = [record for stored in buckets for record in stored]

with open(OUTPUT_JSON, mode='w', encoding='utf-8') as f:
    f.write(json.dumps(dataset, ensure_ascii=False))
print(f"Guardado final: {OUTPUT_JSON} con {len(dataset)} partidas")


Partida analizada: SantiagoBA2023 vs FrostySnowman - Ganador: black
[Bucket 6] 1/200
Partida analizada: hikjdb vs joaozinhogibi - Ganador: black
Partida analizada: yurianis vs zRonnyPineda - Ganador: black
[Bucket 5] 1/200
Partida analizada: Hecho_en_CU vs JorisO - Ganador: white
[Bucket 3] 1/200
Partida analizada: scottkerr12 vs mazares - Ganador: black
Partida analizada: Ezzio_Auditore vs ionlymovepons - Ganador: black
[Bucket 7] 1/200
Partida analizada: Mayson31 vs seedsdod - Ganador: black
[Bucket 4] 1/200
Partida analizada: klasik_bir_gitar vs jewgienij - Ganador: black
Partida analizada: Alijnoud vs Saergut - Ganador: draw
Partida analizada: Tims101 vs aboalwan - Ganador: black
Partida analizada: Labernet vs Mourad686Abdelaziz - Ganador: black
Partida analizada: noenemies17 vs Clayboard - Ganador: white
Partida analizada: cartanya vs AlexisIriarte - Ganador: white
Partida analizada: PremierGarbage vs mrnoname87 - Ganador: black
Partida analizada: IPlayForTheKing vs nuncarezei - G

KeyboardInterrupt: 