# Process Data for veRL
---
At a high level, here is what we need:
- veRL requires the data to be in Parquet format (a type of data file)
- The data needs to be stored in a specific format with specific column names
- We also need to apply `json.dumps` to certain columns so that the nested dicts are loadable in veRL

In [7]:
import os
import ast
import json
import random
import numpy as np
import pandas as pd
from typing import List
from scipy import stats
from collections import defaultdict
import ast
import pandas as pd
import chess

In [8]:
# First need to load in our data (csv) -- use this function. Can also specify # of samples to load in
# DATA_ROOT is the absolute path of the parent of the current working directory.
# The CSV loaders read from DATA_ROOT/raw_data and the final cell writes its
# parquet files to DATA_ROOT/parquet_datasets.
DATA_ROOT = os.path.abspath(os.path.join(os.path.abspath(os.getcwd()), ".."))

def _load_challenge_moves_csv(filename: str, shuffle: bool = True, max_samples: int = None) -> pd.DataFrame:
    """
    Read a challenge CSV from DATA_ROOT/raw_data and parse its list-valued columns.

    The 'Move' and 'Win Probability' columns are stored as stringified Python
    lists; both are parsed with ast.literal_eval, and any apostrophes left
    inside individual move strings are stripped.

    Args:
        filename (str): Name of the csv file inside the 'raw_data' folder.
        shuffle (bool): If True, shuffle all rows with a fixed seed (42) and
            reset the index.
        max_samples (int, optional): Keep only the first `max_samples` rows
            (after the optional shuffle). If None, keep every row.

    Returns:
        pd.DataFrame: The parsed (and optionally shuffled / truncated) frame.
    """
    csv_path = os.path.join(DATA_ROOT, "raw_data", filename)
    frame = pd.read_csv(csv_path)

    def _parse_moves(raw):
        # Stringified list -> real list, dropping stray apostrophes in each move
        return [entry.replace("'", "") for entry in ast.literal_eval(raw)]

    frame["Move"] = frame["Move"].apply(_parse_moves)
    frame["Win Probability"] = frame["Win Probability"].apply(ast.literal_eval)

    if shuffle:
        # Fixed seed keeps the row order reproducible across runs
        frame = frame.sample(frac=1, random_state=42).reset_index(drop=True)

    if max_samples is None:
        return frame
    return frame.head(max_samples)



In [9]:
# Support for various board/move formats

def fen_to_description(fen: str) -> str:
    """
    Converts a FEN (Forsyth-Edwards Notation) string into a human-readable chessboard description.

    The first line states the side to move; each following line lists the
    squares occupied by one (color, piece type) pair, sorted by color and then
    piece type. Squares within a line appear in FEN order (rank 8 down to
    rank 1, file a to h).

    Only the first two FEN fields (piece placement and side to move) are used.
    The placement field is validated: it must have 8 ranks, each describing
    exactly 8 squares, using only piece letters and the digits 1-8. The side
    to move must be 'w' or 'b'.

    Args:
        fen (str): The FEN string representing the board state.

    Returns:
        str: A formatted description of the board state. On invalid input no
        exception is raised; instead a string starting with
        "Error processing FEN:" (validation failure) or "Unexpected error:"
        (anything else, e.g. a non-string argument) is returned.
    """
    piece_map = {
        "K": "King",
        "Q": "Queen",
        "R": "Rook",
        "B": "Bishop",
        "N": "Knight",
        "P": "Pawn",
        "k": "King",
        "q": "Queen",
        "r": "Rook",
        "b": "Bishop",
        "n": "Knight",
        "p": "Pawn",
    }

    try:
        fen_parts = fen.split()
        if len(fen_parts) < 2:
            raise ValueError(
                "Invalid FEN format. Ensure it has at least a board position and turn information."
            )

        ranks = fen_parts[0].split("/")
        if len(ranks) != 8:
            raise ValueError("Invalid FEN format. The board should have 8 ranks.")

        # Anything other than 'w'/'b' used to be reported as "Black to move."
        if fen_parts[1] not in ("w", "b"):
            raise ValueError(
                f"Invalid FEN format. Side to move must be 'w' or 'b', got '{fen_parts[1]}'."
            )
        turn = "White to move." if fen_parts[1] == "w" else "Black to move."

        piece_positions = defaultdict(list)

        for r, rank in enumerate(ranks):
            file = 0  # 0-based file index (0 -> 'a')
            for char in rank:
                if char in "12345678":
                    # A digit encodes that many consecutive empty squares
                    file += int(char)
                elif char in piece_map:
                    color = "White" if char.isupper() else "Black"
                    # The first FEN rank is rank 8, hence 8 - r
                    position = f"{chr(file + 97)}{8 - r}"
                    piece_positions[(color, piece_map[char])].append(position)
                    file += 1
                else:
                    raise ValueError(f"Invalid character '{char}' in FEN notation.")
            # Without this check an over-long rank silently produced squares like 'i8'
            if file != 8:
                raise ValueError(
                    f"Invalid FEN format. Rank {8 - r} describes {file} squares instead of 8."
                )

        description = [turn]

        # Keys are unique (color, piece type) tuples, so sorting by key is total
        for (color, piece_type), positions in sorted(
            piece_positions.items(), key=lambda item: item[0]
        ):
            position_text = ", ".join(positions)
            description.append(
                f"{color} {piece_type}{'s' if len(positions) > 1 else ''} on {position_text}."
            )

        return "\n".join(description)

    except ValueError as e:
        return f"Error processing FEN: {e}"
    except Exception as e:
        # Kept for backward compatibility: callers receive a string, never an exception
        return f"Unexpected error: {str(e)}"

def convert_uci_moves_to_pgn(fen, uci_moves):
    """
    Translate UCI move strings into the notation returned by `board.san`
    for the position described by `fen`.

    Every move is interpreted against the same starting position; the board
    is never advanced between moves.

    Args:
        fen: FEN string used to construct the `chess.Board`.
        uci_moves: Iterable of UCI move strings (e.g. "h8h7").

    Returns:
        list: One converted string per input move that is legal in the
        position, in input order. Moves that are not legal are skipped.

    NOTE(review): because illegal moves are skipped, the result can be shorter
    than `uci_moves`; callers that pair it positionally with another list
    (e.g. win probabilities) should confirm that cannot happen in their data.
    """
    position = chess.Board(fen)
    converted = []

    for uci_text in uci_moves:
        candidate = chess.Move.from_uci(uci_text)
        if candidate not in position.legal_moves:
            continue
        converted.append(position.san(candidate))

    return converted
# Sanity check for convert_uci_moves_to_pgn: `leg` holds the UCI legal moves
# listed in the example of the system prompts, and the FEN below has the same
# piece placement as that example. The printed list is the converted notation.
leg = ["f5h7", "h8h6", "h2h4", "h8g8", "h2h3", "g1f1", "f5d3", "f5h3", "h8b8", "h8h4", 
       "h8c8", "h8f8", "h8a8", "h8d8", "f5g4", "h8h3", "g3g4", "g1h1", "f5e4", "h8h5", 
       "f5c2", "h8e8", "f5g6", "h8h7"]
print(convert_uci_moves_to_pgn("7R/4n1k1/4P3/1pp2B2/8/6P1/2r4P/6K1 w - - 0 1", leg))

['Bh7', 'Rh6', 'h4', 'Rg8+', 'h3', 'Kf1', 'Bd3', 'Bh3', 'Rb8', 'Rh4', 'Rc8', 'Rf8', 'Ra8', 'Rd8', 'Bg4', 'Rh3', 'g4', 'Kh1', 'Be4', 'Rh5', 'Bxc2', 'Re8', 'Bg6', 'Rh7+']


In [None]:
# Other data loader for move notation
def load_challenge_moves_csv(filename: str, move_notation: str = 'UCI', shuffle: bool = True, max_samples: int = None) -> pd.DataFrame:
    """
    Loads a CSV file into a pandas DataFrame, converts list-like string columns into actual lists,
    removes single apostrophes from 'Move' column values, and optionally shuffles the DataFrame.

    The moves are left as read from the CSV when 'move_notation' is 'UCI'; when it
    is 'PGN' each row's moves are passed through convert_uci_moves_to_pgn together
    with the row's 'FEN'.

    Args:
        filename (str): Name of the csv file in the 'raw_data' folder under DATA_ROOT.
        move_notation (str): Notation the returned 'Move' column should use ('UCI' or 'PGN').
        shuffle (bool): Whether to shuffle the DataFrame with a fixed seed (default is True).
        max_samples (int, optional): Maximum number of rows to return. If None, returns all rows.

    Returns:
        pd.DataFrame: The processed DataFrame.

    Raises:
        ValueError: If 'move_notation' is neither 'UCI' nor 'PGN'.
    """
    # Fail fast: an unrecognised notation used to be silently treated as 'UCI'
    if move_notation not in ('UCI', 'PGN'):
        raise ValueError(
            f"Invalid move notation {move_notation!r}. Must be 'UCI' or 'PGN'."
        )

    df = pd.read_csv(os.path.join(DATA_ROOT, "raw_data", filename))

    # Convert the columns from strings to lists (using ast.literal_eval)
    df["Move"] = df["Move"].apply(lambda x: [move.replace("'", "") for move in ast.literal_eval(x)])
    df["Win Probability"] = df["Win Probability"].apply(ast.literal_eval)

    if move_notation == 'PGN':
        # NOTE(review): convert_uci_moves_to_pgn skips moves it considers illegal,
        # while 'Win Probability' is left untouched -- confirm the two lists
        # cannot end up with different lengths for this data.
        df["Move"] = df.apply(lambda row: convert_uci_moves_to_pgn(row["FEN"], row["Move"]), axis=1)

    if shuffle:
        df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    if max_samples is not None:
        df = df.head(max_samples)

    return df

In [None]:
# Various processing functions
# ========================================================

# Top-level processing function that we can apply to each row of our csv dataset
def process_fn(example, idx, split):
    """
    Processes a single row in the dataset.

    Builds the prompt from the row's 'FEN' and 'Move' entries and the ground
    truth reward dict from its 'Move' and 'Win Probability' entries (using the
    "zscore" reward).

    Args:
        example (pd.Series): A row from the DataFrame.
        idx (int): The index of the row.
        split (str): The dataset split ('train' or 'test').

    Returns:
        dict: Processed row in the desired format.
    """
    moves = example['Move']

    # format_prompt shuffles the move list it is given. Previously it received
    # the row's own list and ran first, so the rewards below were zipped from a
    # shuffled move list against unshuffled win probabilities. Build the rewards
    # from the aligned lists, and hand the prompt formatter its own copy.
    solution = create_reward_dict(move=list(moves), win_prob=example['Win Probability'], reward_name="zscore")
    question = format_prompt(board=example['FEN'], legal_moves=list(moves))

    return {
        "data_source": "chess_reasoning",
        "prompt": [{
            "role": "user",
            "content": question,
        }],
        "ability": "math",
        "reward_model": {
            "style": "rule",
            "ground_truth": solution
        },
        "extra_info": {
            'split': split,
            'index': idx
        }
    }

# Constants for reward functions
# ========================================================
# Normalized rewards below this value are zeroed out by reward_clipped
CLIPPING_THRESHOLD = 0.4


def reward_clipped(legal_moves, win_probabilities):
    """
    Map each legal move to a clipped, normalized reward.

    The win probabilities are z-scored and then passed through
    clipped_normalize_rewards with CLIPPING_THRESHOLD; moves and rewards are
    paired positionally.
    """
    standardized = stats.zscore(win_probabilities)
    clipped = clipped_normalize_rewards(standardized, CLIPPING_THRESHOLD)
    return {move: reward for move, reward in zip(legal_moves, clipped)}

def reward_zscore(legal_moves, win_probabilities):
    """
    Map each legal move to a normalized reward.

    The win probabilities are z-scored and then rescaled by
    normalize_rewards; moves and rewards are paired positionally.
    """
    rescaled = normalize_rewards(stats.zscore(win_probabilities))
    return {move: reward for move, reward in zip(legal_moves, rescaled)}


def normalize_rewards(z_scores):
    """
    Min-max scale `z_scores` into [0, 1].

    Degenerate inputs -- every value identical, or every value NaN (which is
    what z-scoring constant or single-element data yields) -- cannot be
    ranked, so every entry gets reward 1.

    Args:
        z_scores: Array-like of numeric scores.

    Returns:
        np.ndarray: Float array with one reward per input score (empty for
        empty input).
    """
    z_scores = np.asarray(z_scores, dtype=float)
    if z_scores.size == 0:
        return z_scores

    # All-NaN must be checked explicitly: NaN never compares equal to itself
    if np.isnan(z_scores).all():
        return np.ones_like(z_scores)

    z_min = np.min(z_scores)
    z_max = np.max(z_scores)
    if z_min == z_max:
        # One reward per score (a bare `[1]` would only cover the first move
        # once zipped against the move list)
        return np.ones_like(z_scores)
    return (z_scores - z_min) / (z_max - z_min)


def clipped_normalize_rewards(z_scores, threshold=0.5):
    """
    Min-max scale `z_scores` to [0, 1], zero everything below `threshold`,
    then rescale the result to [0, 1] again.

    Degenerate inputs -- every value (nearly) identical, or every value NaN
    (which is what z-scoring constant or single-element data yields) -- get
    reward 0 for every entry.

    Args:
        z_scores: Array-like of numeric scores.
        threshold (float): Normalized values strictly below this are set to 0.

    Returns:
        np.ndarray: Float array with one reward per input score (empty for
        empty input).
    """
    z_scores = np.array(z_scores, dtype=float)

    if z_scores.size == 0:  # np.min / np.max raise on empty input
        return z_scores

    # np.isclose(nan, nan) is False, so all-NaN input would otherwise slip
    # through the degenerate check below and produce NaN rewards
    if np.isnan(z_scores).all():
        return np.zeros_like(z_scores)

    # Step 1: Normalize to range [0, 1]
    z_min = np.min(z_scores)
    z_max = np.max(z_scores)

    if np.isclose(z_min, z_max):  # Avoid division by zero
        return np.zeros_like(z_scores)

    normalized = (z_scores - z_min) / (z_max - z_min)

    # Step 2: Apply threshold (set values < threshold to 0)
    normalized[normalized < threshold] = 0

    # Step 3: Renormalize to [0, 1] if necessary
    new_min = np.min(normalized)
    new_max = np.max(normalized)

    if np.isclose(new_min, new_max):  # If all remaining values are the same, return as is
        return np.zeros_like(normalized) if new_max == 0 else np.ones_like(normalized)

    return (normalized - new_min) / (new_max - new_min)



# Functions to create our reward dict -- can update this to try different reward funcs
def create_reward_dict(move: List[str], win_prob: List[float], reward_name: str = '') -> dict:
    """
    Builds a dictionary mapping each legal move to a reward derived from its win probability.

    Args:
        move (List[str]): The legal moves.
        win_prob (List[float]): Win probability for each move, in the same order as `move`.
        reward_name (str): Which reward to use. "clipped" delegates to reward_clipped and
            "zscore" to reward_zscore. Any other value uses plain min-max scaling of the win
            probabilities to [0, 1] (all-equal probabilities are mapped to 0.5).

    Returns:
        dict: Mapping of move -> reward. Empty if no (move, probability) pairs are given.
    """
    if reward_name == "clipped":
        return reward_clipped(move, win_prob)
    elif reward_name == "zscore":
        return reward_zscore(move, win_prob)

    # Pair moves with probabilities positionally (zip stops at the shorter list).
    pairs = list(zip(move, win_prob))
    if not pairs:
        # Nothing to normalize; min()/max() would fail on an empty array.
        return {}

    win_probs = np.array([prob for _, prob in pairs], dtype=float)

    # Apply min-max normalization.
    min_val = win_probs.min()
    max_val = win_probs.max()
    if max_val - min_val > 0:
        normalized_win_probs = (win_probs - min_val) / (max_val - min_val)
    else:
        # If all values are the same, set them to 0.5.
        normalized_win_probs = np.full_like(win_probs, 0.5)

    # Create a dictionary mapping each move to its normalized win probability.
    return {m: float(p) for (m, _), p in zip(pairs, normalized_win_probs)}


# Functions to process / output our prompt from the initial data in the master csv
# System prompt for the raw-FEN board representation with UCI-style moves.
# format_prompt prepends it when board_type == "FEN".
# NOTE(review): the worked example wraps the moves in "<legal moves>" while
# format_prompt emits "<legalmoves>" -- confirm which tag is intended.
SYSTEM_PROMPT = """<|im_start|>system
You are a smart, strategic, and wise chess reasoning model. You are currently in a chess tournament where you have 1 minute to make a move.

We will provide you with a board in Forsyth-Edwards Notation (FEN) and a list of legal moves. Your task is to reason through the board state and determine an optimal move based on your analysis.

The reasoning process and answer must be enclosed within <think> </think> and <answer> </answer> tags, respectively. For example, when given an input prefixed with "user:", your response should be in the format "assistant: <think> [your reasoning] </think> <answer> [chosen move] </answer>".

Below is an example of your desired behavior:

Example 1:
user: <FEN> 7R/4n1k1/4P3/1pp2B2/8/6P1/2r4P/6K1 w - - 3 50 </FEN> <legal moves> [f5h7, h8h6, h2h4, h8g8, h2h3, g1f1, f5d3, f5h3, h8b8, h8h4, h8c8, h8f8, h8a8, h8d8, f5g4, h8h3, g3g4, g1h1, f5e4, h8h5, f5c2, h8e8, f5g6, h8h7] </legal moves>
assistant: <think> Playing as white, I'm in the offensive here. My rook is currently in at risk of being taken by their king and my bishop is at risk of being taken by their knight. I could take their rook with their bishop but they would take my rook. However, if I move my rook to h7, I'll put their king in check while saving my rook and bishop and continue pressure. Moving rook h8 to h7 is a wise move. </think> <answer> h8h7 </answer>

Make sure that your chosen move is in standard chess notation (such as 'g8f7' -- which means you move the piece from g8 to f7). 

Use English for your thought process. Remember you have one minute to move so be quick.<|im_end|>"""

# System prompt for the textual board description (the format produced by
# fen_to_description) with UCI-style moves. format_prompt prepends it when
# board_type == "desc".
# NOTE(review): the worked example wraps the moves in "<legal moves>" while
# format_prompt emits "<legalmoves>" -- confirm which tag is intended.
SYSTEM_PROMPT_DESC = """<|im_start|>system
You are a smart, strategic, and wise chess reasoning model currently in a chess tournament where you have 1 minute to make a move.

Given a chess board and a list of legal moves, you think through the various moves you can make and reason about which move is the best. Then you provide your final move back to the user based on your reasoning analysis.

The reasoning process and answer must be enclosed within <think> </think> and <answer> </answer> tags, respectively. For example, when given an input prefixed with "user:", your response should be in the format "assistant: <think> [your reasoning] </think> <answer> [chosen move] </answer>".

Below is an example of your desired behavior:

Example 1:
user: <board> White to move.
Black King on g7
Black Knight on e7
Black Pawns on b5, c5
Black Rook on c2
White Bishop on f5
White King on g1
White Pawns on e6, g3, h2
White Rook on h8 </board> <legal moves> [f5h7, h8h6, h2h4, h8g8, h2h3, g1f1, f5d3, f5h3, h8b8, h8h4, h8c8, h8f8, h8a8, h8d8, f5g4, h8h3, g3g4, g1h1, f5e4, h8h5, f5c2, h8e8, f5g6, h8h7] </legal moves>
assistant: <think> Playing as white, I'm in the offensive here. My rook is currently in at risk of being taken by their king and my bishop is at risk of being taken by their knight. I could take their rook with their bishop but they would take my rook. However, if I move my rook to h7, I'll put their king in check while saving my rook and bishop and continue pressure. Moving rook h8 to h7 is a wise move. </think> <answer> h8h7 </answer>

Make sure that your chosen move is provided in standard chess bot notation, where you give the square of the piece you want to move and the square you want to move it to.
For example, if you want to move the piece on c2 to e1, this would be c2e1. The list of legal moves are given in the same format.

Please use English for your thought process. Remember you have one minute to move so make sure your thinking isn't too long, and if your final answer is not enclosed in <answer>  </answer> tags you will lose.<|im_end|>"""

# System prompt for the textual board description with the moves written in
# the notation produced by convert_uci_moves_to_pgn (e.g. "Rh7+").
# NOTE(review): nothing in the visible code selects this prompt -- format_prompt
# only handles board_type "FEN" and "desc"; confirm where it is meant to be used.
SYSTEM_PROMPT_DESC_PGN = """<|im_start|>system
You are a smart, strategic, and wise chess reasoning model currently in a chess tournament where you have 1 minute to make a move.

Given a chess board and a list of legal moves, you think through the various moves you can make and reason about which move is the best. Then you provide your final move back to the user based on your reasoning analysis.

The reasoning process and answer must be enclosed within <think> </think> and <answer> </answer> tags, respectively. For example, when given an input prefixed with "user:", your response should be in the format "assistant: <think> [your reasoning] </think> <answer> [chosen move] </answer>".

Below is an example of your desired behavior:

Example 1:
user: <board> White to move.
Black King on g7
Black Knight on e7
Black Pawns on b5, c5
Black Rook on c2
White Bishop on f5
White King on g1
White Pawns on e6, g3, h2
White Rook on h8 </board> <legal moves> [Bh7, Rh6, h4, Rg8+, h3, Kf1, Bd3, Bh3, Rb8, Rh4, Rc8, Rf8, Ra8, Rd8, Bg4, Rh3, g4, Kh1, Be4, Rh5, Bxc2, Re8, Bg6, Rh7+] </legal moves>
assistant: <think> Playing as white, I'm in the offensive here. My rook is currently in at risk of being taken by their king and my bishop is at risk of being taken by their knight. I could take their rook with their bishop but they would take my rook. However, if I move my rook to h7, I'll put their king in check while saving my rook and bishop and continue pressure. Moving rook h8 to h7 is a wise move. </think> <answer> Rh7+ </answer>

Make sure that your chosen move is provided in standard PGN notation.

Please use English for your thought process. Remember you have one minute to move so make sure your thinking isn't too long, and if your final answer is not enclosed in <answer>  </answer> tags you will lose.<|im_end|>"""

def format_prompt(board: str, legal_moves: List[str], board_type: str = 'FEN') -> str:
    """
    Formats the board and legal moves into a prompt for the model.

    The legal moves are presented in a random order (drawn from the global
    `random` state). The caller's list is NOT modified: a copy is shuffled, so
    any other list that is index-aligned with `legal_moves` (e.g. the win
    probabilities) stays aligned.

    Args:
        board (str): The current board state as a FEN string.
        legal_moves (List[str]): The list of legal moves.
        board_type (str): 'FEN' to embed the FEN string as-is (with SYSTEM_PROMPT),
            or 'desc' to embed the output of fen_to_description (with SYSTEM_PROMPT_DESC).

    Returns:
        str: The formatted prompt (system prompt, user turn, and the opening of
        the assistant turn).

    Raises:
        ValueError: If `board_type` is neither 'FEN' nor 'desc'.
    """
    # Shuffle a copy: random.shuffle works in place and would otherwise reorder
    # the caller's list. Shuffling a copy consumes the RNG exactly as before.
    shuffled_moves = list(legal_moves)
    random.shuffle(shuffled_moves)

    if board_type == "FEN":
        board_representation = board
        prompt = f"<|im_start|>user: <FEN> {board_representation} </FEN> <legalmoves> {shuffled_moves} </legalmoves><|im_end|>\n<|im_start|>assistant: "
        # The list repr quotes each move; strip the quotes from the prompt text
        prompt = prompt.replace("'", "")
        return SYSTEM_PROMPT + '\n' + prompt
    elif board_type == "desc":
        board_representation = fen_to_description(board)
        prompt = f"<|im_start|>user: <board> {board_representation} </board> <legalmoves> {shuffled_moves} </legalmoves><|im_end|>\n<|im_start|>assistant: "
        prompt = prompt.replace("'", "")
        return SYSTEM_PROMPT_DESC + '\n' + prompt
    else:
        raise ValueError("Invalid board type. Must be 'FEN' or 'desc'.")

    

In [4]:
# Code to load and process our data -- saves as parquet
experiment_name = "old_system_prompt"
max_samples = 5000   # Let's use 5k max samples for now; no need to do a val set for now as well
test_samples = 256   # Keep as is -- want this smol

# Apply transformation to train and test datasets
# If we want to be super careful we can actually pre-split into train / val to make sure no contamination but imo val results not super important but we need something small
# NOTE: both loads shuffle the same file with the same fixed seed, so the test
# rows are the first `test_samples` rows of the training set (full overlap).
train_df = _load_challenge_moves_csv("chess_challenges_full.csv", shuffle=True, max_samples=max_samples)
train_dataset = train_df.apply(lambda row: process_fn(row, row.name, "train"), axis=1)
train_dataset = pd.DataFrame(train_dataset.tolist())
test_df = _load_challenge_moves_csv("chess_challenges_full.csv", shuffle=True, max_samples=test_samples)
# Label these rows as the "test" split (they were previously tagged "train")
test_dataset = test_df.apply(lambda row: process_fn(row, row.name, "test"), axis=1)
test_dataset = pd.DataFrame(test_dataset.tolist())

# Need to fix due to parquet screwing up dicts
train_dataset["reward_model"] = train_dataset["reward_model"].apply(json.dumps)
test_dataset["reward_model"] = test_dataset["reward_model"].apply(json.dumps)

# Save our parquets down
train_parquet_filename = f"train-{experiment_name}-{max_samples//1000}k.parquet"
test_parquet_filename = f"test-{experiment_name}-{test_samples}.parquet"
train_dataset.to_parquet(os.path.join(DATA_ROOT, "parquet_datasets", train_parquet_filename), index=False)
test_dataset.to_parquet(os.path.join(DATA_ROOT, "parquet_datasets", test_parquet_filename), index=False)