In [1]:
import os
import ast
import json
import random
import pandas as pd
from collections import Counter

from raw.generation_util.parsing import pqt_extract_ground_truth
from raw.utils.board import visual_to_fen, extract_visual
from raw.utils.parsing import parse_fen

In [2]:
# Which split / task the loaded file belongs to (only used by the
# commented-out parquet path and ground-truth coercion below).
task_split = "train"
task_type = "rejsampling"

# parent_dir = f"cleaned/verl_tasks/{task_split}"
# filename = f"{task_type}.parquet"
# filepath = os.path.join(parent_dir, filename)
filename = "rejsampling_predictmove_balanced_5775.jsonl"
filepath = f"cleaned/train_data/{filename}"

# Load data agnostic to parquet / json / jsonl
if filename.endswith('.parquet'):
    df = pd.read_parquet(filepath)
    data = df.to_dict('records')
    print(f"Loaded {len(data)} items from parquet file {filename}.")
elif filename.endswith('.json'):
    # Explicit UTF-8 so this branch decodes the same way as the JSONL branch
    # regardless of the platform's default encoding.
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)
    print(f"Loaded {len(data)} items from JSON file {filename}.")
elif filename.endswith('.jsonl'):
    with open(filepath, 'r', encoding='utf-8') as f:
        # Skip blank lines (e.g. a trailing empty line at EOF); json.loads
        # would raise JSONDecodeError on them.
        data = [json.loads(line) for line in f if line.strip()]
    print(f"Loaded {len(data)} items from JSONL file {filename}.")
else:
    raise ValueError(f"Unsupported file format: {filename}. Please use .parquet, .json, or .jsonl files.")

# In-place, unseeded shuffle: row order differs on every run.
random.shuffle(data)

# Need to coerce since we stringified the ground truth dict due to issues w/ parquet saving
# for row in data:
#     row['reward_model']['ground_truth'] = pqt_extract_ground_truth(row['reward_model']['ground_truth'], task_type = task_type)

print(f"Shuffled {len(data)} items.")

Loaded 5775 items from JSONL file rejsampling_predictmove_balanced_5775.jsonl.
Shuffled 5775 items.


In [3]:
# Reshuffle in place so that re-running this cell puts a different row at index 0
# (the row printed at the bottom of the cell).
random.shuffle(data)

def print_data(datum):
    """Print each non-system turn of a conversation.

    ``datum`` is iterated as a sequence of turns. For every turn, element 0
    is treated as the role and element 1 as the text. Turns whose role equals
    "system" are skipped; all others are printed as a role header, a dashed
    rule, and the text, followed by blank lines.
    """
    for turn in datum:
        role = turn[0]
        if role != "system":
            # The text is only read for turns that are actually printed.
            print(f"{role}:\n------------------------\n{turn[1]}\n\n")

# Show the conversation stored under the 'chat' key of the first row.
# NOTE(review): other cells in this notebook read row['prompt'] and
# row['reward_model'] instead of row['chat'] — confirm which schema the
# currently loaded file actually uses before running them together.
print_data(data[0]['chat'])

user:
------------------------
Below is a chess board from your current game.

8| r n . q . b n r
7| p . . . p k . p
6| . p . . . . p .
5| . . p . p . . .
4| . . . . . . . .
3| . . . P . . P .
2| P P P . . P . P
1| R N B . K . . .
   _ _ _ _ _ _ _ _
   A B C D E F G H

- It is Black’s turn to move.
- Castling rights: White can castle queenside.
- No en passant target square.
- Halfmove clock: 0
- Fullmove number: 12

You must select the best move from this position and return it within answer tags. Your answer must be formatted as <answer> my_move </answer>, where my_move is a legal move in UCI notation.

Think step by step if necessary, but do not omit the answer tags or UCI format. Only answers in the correct format will be accepted.


assistant:
------------------------
<think>
First, let's analyze the given position:
The current state of the board is:
8| r n . q . b n r
7| p . . . p k . p
6| . p . . . . p .
5| . . p . p . . .
4| . . . . . . . .
3| . . . P . . P .
2| P P P . . P . P

In [3]:
# --- Config ------------------------------------------------------
# Each entry is ((lowest fullmove, highest fullmove), desired share of rows);
# both bounds are inclusive (see the `lo <= fullmove <= hi` test below).
FULLMOVE_DEFAULT_BUCKETS = [
    ((0, 9),   0.12),
    ((10, 19), 0.30),
    ((20, 29), 0.30),
    ((30, 39), 0.15),
    ((40, 150), 0.13),
]

# --- Collect counts ---------------------------------------------
bucket_labels = [f"{lo}-{hi}" for (lo, hi), _ in FULLMOVE_DEFAULT_BUCKETS]
counts = Counter({lbl: 0 for lbl in bucket_labels})
total  = 0

for row in data:
    # The board is recovered from the user message (second prompt entry),
    # converted to FEN, and the fullmove number is read from the parsed FEN.
    fullmove = parse_fen(visual_to_fen(extract_visual(row["prompt"][1]["content"])))["fullmove_number"]
    for (lo, hi), _ in FULLMOVE_DEFAULT_BUCKETS:
        if lo <= fullmove <= hi:
            counts[f"{lo}-{hi}"] += 1
            break
    # Rows that match no bucket still count toward `total`, so the Observed
    # column can sum to less than 100%.
    total += 1

# --- Print table -------------------------------------------------
print(f"{'Bucket':<8} | {'Observed':>8} | {'Desired':>8}")
print("-" * 30)
for (lo, hi), desired in FULLMOVE_DEFAULT_BUCKETS:
    lbl = f"{lo}-{hi}"
    obs = counts[lbl] / total if total else 0
    # Width 8 matches the header columns so the '|' separators line up
    # (the previous width of 7 shifted every data row left by one character).
    print(f"{lbl:<8} | {obs:>8.2%} | {desired:>8.2%}")


Bucket   | Observed |  Desired
------------------------------
0-9      |   9.96% |  12.00%
10-19    |  29.98% |  30.00%
20-29    |  29.98% |  30.00%
30-39    |  15.04% |  15.00%
40-150   |  15.04% |  13.00%


In [6]:
# Pick one row at random and show its system prompt, user prompt and ground truth.
sample = random.choice(data)
# for key, value in sample.items():
#     print(key)
#     print(type(value))
#     print(f"{value}\n{'-'*60}\n")

# The first two prompt entries are used as the system and user messages.
prompt_turns = sample['prompt']
sys_prompt, user_prompt = prompt_turns[0], prompt_turns[1]
ground_truth = sample['reward_model']['ground_truth']

print(f"System:\n{sys_prompt['content']}\n")
print(f"User:\n{user_prompt['content']}\n")
print(f"Ground Truth:\n{ground_truth}")

System:
You are a chess grandmaster currently playing against a strong opponent. Assume they will capitalize on any blunders you make.

You are exceptional at thinking through various board states and strategically planning your next move. You are detail oriented, strategic, and an efficient thinker. You use all of these traits to be an effective chess player.

Do not hallucinate. All statements should be based on the provided board and you must avoid considering pieces that do not exist or moves that are illegal.

You must refer to moves in UCI notation (e.g., d7d5) and include your thinking in think tags (e.g., <think> your_thinking </think>) and your answer in answer tags (e.g., <answer> your_answer </answer>). If you do not need to reason heavily, you should still include think tags but with '\n\n' (e.g., <think> \n\n <\think>).

Finally, make sure to always think out loud (using the <think> tags) to convey your thought process as you consider various moves and solutions to puzzles

## Code to clean up / concatenate existing data

In [None]:
# # Example usage:
# parent_dir = "explainer_data"
# filenames = [
#     "explainer_clean_100_1558_15.parquet",
#     "explainer_clean_1250.parquet",
#     "explanations_0_1000_0104_16.parquet",
#     "explanations_1_1000_0330_16.parquet",
#     "explanations_2_1000_0557_16.parquet",
#     "explanations_3_1000_0826_16.parquet"
# ]
# filepaths = [os.path.join(parent_dir, fn) for fn in filenames]
# output_file = "combined_chessexplainer.jsonl"
# system_prompt_file = "chess_task_sysprompt.txt"  # Or "llama4_default_sysprompt.txt"


# HEADER_PATTERN = re.compile(
#     r"<\|start_header_id\|>(\w+)<\|end_header_id\|>\n?(.*?)(?=(<\|start_header_id\|>|\Z|<\|eot_id\|>))",
#     re.DOTALL
# )

# def extract_dialogue(sample):
#     text = sample['prompt']
#     result = []
#     for match in HEADER_PATTERN.finditer(text):
#         role, content = match.group(1), match.group(2).strip()
#         # Remove Llama tags and both eot_id types
#         content = re.sub(r"<\|.*?\|>|<eot_id>", "", content).strip()
#         if role == "system":
#             content = "chess_task_sysprompt.txt"
#         if role == "user":
#             prefix = "Here is a board in a game you're currently playing. I want you to think through some possible moves you could make and how those moves will likely play out. You may find it helpful to roll-out each line assuming your opponent plays near-optimally. You may also find it helpful to consider the value of the final board state after each roll-out.\n\nAfter you think through your various moves, please end by telling me your chosen move (in UCI notation) within answer tags.\n\n"
#             content = prefix + content
#         if content:
#             result.append((role, content))
#     completion = sample.get('completion', '').strip()
#     completion = re.sub(r'(<\|eot_id\|>|<eot_id>)\s*$', '', completion).strip()
#     if completion:
#         result.append(('assistant', completion))
#     return result

# def convert_and_save(parquet_paths, output_path):
#     all_dialogues = []
#     for path in parquet_paths:
#         df = pd.read_parquet(path)
#         all_dialogues.extend(
#             {"chat": extract_dialogue(row)} for row in df.to_dict('records')
#         )
#     # Save as JSONL
#     with open(output_path, 'w', encoding='utf-8') as f:
#         for d in all_dialogues:
#             json.dump(d, f, ensure_ascii=False)
#             f.write('\n')
#     print(f"Saved {len(all_dialogues)} dialogues to {output_path}")


# output_path = os.path.join(parent_dir, output_file)
# convert_and_save(filepaths, output_path)

## Code to Count Length of Responses in the Model Responses Folder

In [5]:
import os
import json

# Root folder; every sub-directory is treated as one model's set of response files.
BASE_DIR = "cleaned/model_responses"

# sorted(): os.listdir returns entries in arbitrary order, so sort for a
# reproducible report.
for folder in sorted(os.listdir(BASE_DIR)):
    folder_path = os.path.join(BASE_DIR, folder)
    if not os.path.isdir(folder_path):
        continue

    response_lengths = []
    for file in sorted(os.listdir(folder_path)):
        if not file.endswith(".json"):
            continue

        file_path = os.path.join(folder_path, file)
        with open(file_path, "r", encoding="utf-8") as f:
            # Deliberately NOT named `data`: that name holds the dataset loaded
            # earlier in the notebook and must not be clobbered by this cell.
            responses = json.load(f)

        for elem in responses:
            # A missing or null "model_response" counts as an empty response.
            resp = elem.get("model_response") or ""
            response_lengths.append(len(resp))

    # Mean response length in characters, one line per model folder.
    if response_lengths:
        mean_length = sum(response_lengths) / len(response_lengths)
        print(f"{folder}: {mean_length:.2f}")
    else:
        print(f"{folder}: No model responses found.")


llmchess-llama31-8b-400: 1052.18
llmchess-llama31-8b-sft-mmxl-400: 3376.89
llmchess-llama33-70b-400: 1957.45
llmchess-llama4-scout-400: 1824.58
llmchess-qwen25-7b-400: 917.69
llmchess-qwen25-7b-grpo-datamix-2-400: 2513.86
llmchess-qwen25-7b-grpo-mmxl-400: 2932.48
llmchess-qwen25-7b-sft-dm2-v2-400: 4330.30
llmchess-qwen25-7b-sft-mmxl-400: 4130.17


## Code to split my XL_Dataset (15mm samples, 10GB) into equal-sized chunks (originally 10; the count is set via `NUM_PARTS` below) to be uploaded to the S3 instance

In [2]:
"""
Split one big JSON array into NUM_PARTS shards of near-equal row count
without ever holding more than a handful of objects in RAM.
(Originally written for a ~10 GB, ~15 M-row file split into 10 ≈1 GB shards.)

– Input   :  IN_FILE (set in the Configuration section below)
             ⟶ one big JSON array `[ { … }, { … }, … ]`
– Outputs :  OUT_ROOT/xl_sft_p<N>/
               ├─ llamafactory_programmatic_<rows>.json   (the shard)
               └─ dataset_info.json                       (schema-meta)

Requires:  ijson  (streaming JSON parser)
"""

from __future__ import annotations
import json, math, os, shutil
from pathlib import Path

import ijson


##############################################################################
# Configuration
##############################################################################

# Big JSON array to split.
IN_FILE   = Path("llamafactory_programmatic_8984647.json")
# Parent directory; each shard is written to its own xl_sft_p<N>/ sub-folder.
OUT_ROOT  = Path("preprocessed_train_data/new_uploads/exp_29")
NUM_PARTS = 4  # number of shards; rows per shard = ceil(total_rows / NUM_PARTS)
# Set to False to define the helpers in this cell without running the split.
RUN_SPLIT = True


##############################################################################
# Helper – stream-count objects (cheap in RAM, just slow I/O)
##############################################################################

def count_rows(path: Path) -> int:
    """Return how many elements the top-level JSON array in ``path`` contains.

    The file is opened in binary mode and streamed through ``ijson.items``,
    so elements are visited one at a time rather than loaded all at once.
    """
    with path.open("rb") as stream:
        # "item" is the ijson prefix that selects each element of the top-level array.
        return sum(1 for _ in ijson.items(stream, "item"))


##############################################################################
# Helper – open/close a shard in streaming-write mode
##############################################################################

def open_shard(index: int) -> tuple[os.PathLike, object, int]:
    """
    Start shard number ``index``.

    Ensures ``OUT_ROOT/xl_sft_p<index>/`` exists, opens ``tmp.json`` inside it
    for UTF-8 text writing (truncating any existing file), and writes the
    opening "[" of the JSON array.

    Returns ``(shard_dir, file_handle, 0)``; the trailing 0 is the number of
    objects written to the shard so far.
    """
    target_dir = OUT_ROOT / f"xl_sft_p{index}"
    target_dir.mkdir(parents=True, exist_ok=True)

    handle = (target_dir / "tmp.json").open("w", encoding="utf-8")
    handle.write("[")  # opening bracket of the shard's JSON array

    rows_so_far = 0
    return target_dir, handle, rows_so_far


def close_shard(shard_dir: Path, fh, rows_written: int) -> None:
    """
    Finalize one shard.

    Writes the closing "]" of the JSON array and closes ``fh``, renames
    ``shard_dir/tmp.json`` to ``llamafactory_programmatic_<rows_written>.json``,
    and writes a ``dataset_info.json`` next to it that points at the renamed
    file and maps the system / prompt / response columns.

    Args:
        shard_dir: directory that contains the shard's ``tmp.json``.
        fh: open text handle on ``shard_dir/tmp.json``.
        rows_written: number of objects in the shard; becomes part of the
            final file name.
    """
    fh.write("]\n")
    fh.close()

    final_name = f"llamafactory_programmatic_{rows_written}.json"
    final_path = shard_dir / final_name
    # os.replace overwrites an existing destination on every platform;
    # os.rename raises on Windows when a previous run already produced a
    # shard with the same row count.
    os.replace(shard_dir / "tmp.json", final_path)

    info = {
        "llmchess_programmatic": {
            "file_name": final_name,
            "columns": {
                "system": "system",
                "prompt": "user",
                "response": "assistant"
            }
        }
    }
    with (shard_dir / "dataset_info.json").open("w", encoding="utf-8") as f:
        json.dump(info, f, indent=2)


##############################################################################
# Main
##############################################################################

def split_big_json() -> None:
    """
    Stream IN_FILE and distribute its rows over shard directories under OUT_ROOT.

    The input is read twice: once by ``count_rows`` to learn the total, then
    once more to copy objects. Each shard receives at most
    ``ceil(total_rows / NUM_PARTS)`` objects, separated by ",\\n" inside a JSON
    array. A new shard is only opened when another object arrives, so no empty
    trailing shard is produced. An empty input yields a single shard holding
    an empty array.

    Raises:
        ValueError: if NUM_PARTS is smaller than 1.
    """
    if NUM_PARTS < 1:
        raise ValueError(f"NUM_PARTS must be >= 1, got {NUM_PARTS}")

    OUT_ROOT.mkdir(parents=True, exist_ok=True)

    total_rows = count_rows(IN_FILE)
    rows_per_part = math.ceil(total_rows / NUM_PARTS)

    shard_idx = 1
    shard_dir, out_fh, n = open_shard(shard_idx)  # n == objects in current shard

    try:
        with IN_FILE.open("rb") as in_fh:
            # use_float=True: by default ijson yields decimal.Decimal for
            # non-integer numbers, which json.dump cannot serialize.
            for obj in ijson.items(in_fh, "item", use_float=True):
                # rotate shard when full
                if n == rows_per_part:
                    close_shard(shard_dir, out_fh, n)
                    shard_idx += 1
                    shard_dir, out_fh, n = open_shard(shard_idx)

                # streaming-write the current object; every object except the
                # first in a shard is preceded by a separator
                if n:
                    out_fh.write(",\n")
                json.dump(obj, out_fh)
                n += 1
    except BaseException:
        # Do not leak the output handle on failure (closing an already-closed
        # handle is a no-op). The partial tmp.json is left on disk.
        out_fh.close()
        raise

    # final shard
    close_shard(shard_dir, out_fh, n)


# Only perform the (slow, disk-heavy) split when explicitly enabled in the configuration.
if RUN_SPLIT:
    split_big_json()
