# Process Data for veRL
---
High-level summary of what we need here:
- veRL requires the data in parquet format (a type of data file)
- The data needs to be stored in a specific layout with specific column names
- Nested dicts (e.g. the `reward_model` column) must be serialized with `json.dumps` so that they can be loaded by veRL

In [1]:
import os
import ast
import json
import random
import numpy as np
import pandas as pd
from typing import List

In [2]:
# First need to load in our data (csv) -- use this function. Can also specify # of samples to load in
# Absolute path of the parent of the current working directory; the 'raw_data'
# and 'parquet_datasets' folders are resolved relative to this location.
DATA_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

def _load_challenge_moves_csv(filename: str, shuffle: bool = True, max_samples: int = None) -> pd.DataFrame:
    """
    Read a challenge CSV from the 'raw_data' folder and prepare it for processing.

    The 'Move' and 'Win Probability' columns are stored in the CSV as stringified
    Python literals; both are parsed back with ast.literal_eval. Every entry of a
    parsed 'Move' value additionally has its single apostrophes stripped.

    Args:
        filename (str): Name of the csv file inside the 'raw_data' folder under DATA_ROOT.
        shuffle (bool): If True (default), rows are shuffled with a fixed seed (42)
            and the index is reset, so the resulting order is reproducible.
        max_samples (int, optional): Keep only the first `max_samples` rows (taken
            after the optional shuffle). If None, all rows are returned.

    Returns:
        pd.DataFrame: The parsed (and optionally shuffled / truncated) DataFrame.
    """
    def _parse_moves(raw_cell):
        # Parse the stringified list, then drop stray apostrophes from each move.
        return [entry.replace("'", "") for entry in ast.literal_eval(raw_cell)]

    csv_path = os.path.join(DATA_ROOT, "raw_data", filename)
    frame = pd.read_csv(csv_path)

    # Turn the string-encoded columns back into real Python objects.
    frame["Move"] = frame["Move"].apply(_parse_moves)
    frame["Win Probability"] = frame["Win Probability"].apply(ast.literal_eval)

    # Shuffle first so that truncation below keeps a random subset of rows.
    if shuffle:
        frame = frame.sample(frac=1, random_state=42).reset_index(drop=True)

    if max_samples is None:
        return frame
    return frame.head(max_samples)

In [3]:
# Various processing functions
# ========================================================

# Top-level processing function that we can apply to each row of our csv dataset
def process_fn(example, idx, split):
    """
    Processes a single row in the dataset into the record layout written to parquet.

    Args:
        example (pd.Series): A row from the DataFrame. Must provide 'FEN', 'Move'
            (list of moves) and 'Win Probability' (list aligned position-by-position
            with 'Move').
        idx (int): The index of the row; stored under extra_info['index'].
        split (str): The dataset split ('train' or 'test'); stored under extra_info['split'].

    Returns:
        dict: Processed row with keys 'data_source', 'prompt', 'ability',
        'reward_model' and 'extra_info'.
    """
    # 'Move' and 'Win Probability' are paired by position, while the prompt
    # formatter randomizes the order of the moves it displays. Build the reward
    # mapping first and hand the formatter its own copy of the move list, so the
    # row's list can never be reordered before (or after) it is zipped with the
    # win probabilities.
    solution = create_reward_dict(move=example['Move'], win_prob=example['Win Probability'])
    question = format_prompt(board=example['FEN'], legal_moves=list(example['Move']))

    return {
        "data_source": "chess_reasoning",
        "prompt": [{
            "role": "user",
            "content": question,
        }],
        "ability": "math",
        "reward_model": {
            "style": "rule",
            "ground_truth": solution
        },
        "extra_info": {
            'split': split,
            'index': idx
        }
    }


# Functions to create our reward dict -- can update this to try different reward funcs
def create_reward_dict(move: List[str], win_prob: List[float]) -> dict:
    """
    Map each move to its min-max normalized win probability.

    The two lists are paired by position (extra trailing entries in the longer
    list are ignored, as with zip). Win probabilities are rescaled so the
    smallest becomes 0.0 and the largest 1.0.

    Args:
        move (List[str]): Legal moves.
        win_prob (List[float]): Win probability of each move, aligned with `move`.

    Returns:
        dict: {move: normalized win probability as a Python float}. Returns an
        empty dict when there are no (move, probability) pairs. If all
        probabilities are equal (including the single-move case), every move
        maps to 0.5.
    """
    pairs = list(zip(move, win_prob))
    if not pairs:
        # Nothing to normalize; min()/max() on an empty array would raise.
        return {}

    moves = [m for m, _ in pairs]
    win_probs = np.array([p for _, p in pairs], dtype=float)

    # Apply min-max normalization.
    min_val = win_probs.min()
    spread = win_probs.max() - min_val
    if spread > 0:
        normalized_win_probs = (win_probs - min_val) / spread
    else:
        # No spread to scale by (all values identical): use the midpoint for all.
        normalized_win_probs = np.full_like(win_probs, 0.5)

    # Plain floats (not numpy scalars) so the dict can be passed to json.dumps.
    return {m: float(p) for m, p in zip(moves, normalized_win_probs)}


# Functions to process / output our prompt from the initial data in the master csv
# System message prepended verbatim to every prompt produced by format_prompt.
SYSTEM_PROMPT = """<|im_start|>system
You are a smart, strategic, and wise chess reasoning model. You are currently in a chess tournament where you have 1 minute to make a move.

We will provide you with a board in Forsyth-Edwards Notation (FEN) and a list of legal moves. Your task is to reason through the board state and determine an optimal move based on your analysis.

The reasoning process and answer must be enclosed within <think> </think> and <answer> </answer> tags, respectively. For example, when given an input prefixed with "user:", your response should be in the format "assistant: <think> [your reasoning] </think> <answer> [chosen move] </answer>".

Below is an example of your desired behavior:

Example 1:
user: <FEN> 7R/4n1k1/4P3/1pp2B2/8/6P1/2r4P/6K1 w - - 3 50 </FEN> <legal moves> [f5h7, h8h6, h2h4, h8g8, h2h3, g1f1, f5d3, f5h3, h8b8, h8h4, h8c8, h8f8, h8a8, h8d8, f5g4, h8h3, g3g4, g1h1, f5e4, h8h5, f5c2, h8e8, f5g6, h8h7] </legal moves>
assistant: <think> Playing as white, I'm in the offensive here. My rook is currently in at risk of being taken by their king and my bishop is at risk of being taken by their knight. I could take their rook with their bishop but they would take my rook. However, if I move my rook to h7, I'll put their king in check while saving my rook and bishop and continue pressure. Moving rook h8 to h7 is a wise move. </think> <answer> h8h7 </answer>

Make sure that your chosen move is in standard chess notation (such as 'g8f7' -- which means you move the piece from g8 to f7). 

Use English for your thought process. Remember you have one minute to move so be quick.<|im_end|>"""

def format_prompt(board: str, legal_moves: List[str]) -> str:
    """
    Formats the board and legal moves into a prompt for the model.

    The moves are listed in a random order. The caller's list is left untouched:
    the shuffle is applied to a copy, so lists that are aligned with
    `legal_moves` by position (e.g. win probabilities) stay aligned.

    Args:
        board (str): The current board state.
        legal_moves (List[str]): The list of legal moves.

    Returns:
        str: SYSTEM_PROMPT, a newline, then the user turn and the opening of the
        assistant turn. Single quotes are stripped from the user/assistant part.
    """
    # Shuffle a copy -- random.shuffle works in place and would otherwise reorder
    # the list owned by the caller.
    shuffled_moves = list(legal_moves)
    random.shuffle(shuffled_moves)

    # NOTE(review): the example inside SYSTEM_PROMPT uses "<legal moves>" (with a
    # space) while this template uses "<legalmoves>" -- confirm which is intended.
    prompt = f"<|im_start|>user: <FEN> {board} </FEN> <legalmoves> {shuffled_moves} </legalmoves><|im_end|>\n<|im_start|>assistant: "
    # Drop the quotes that the list repr puts around each move.
    prompt = prompt.replace("'", "")
    return SYSTEM_PROMPT + '\n' + prompt

In [4]:
# Code to load and process our data -- saves as parquet
experiment_name = "old_system_prompt"
max_samples = 5000   # Use 5k training samples for now; no separate validation set yet
test_samples = 256   # Keep as is -- the test set should stay small

# Apply transformation to train and test datasets.
# NOTE: both splits are loaded from the same CSV with the same fixed shuffle seed,
# so the test rows overlap with the training rows. This is accepted for now because
# the test results are only a rough signal; pre-split the CSV into train / test if
# contamination-free evaluation is ever needed.
train_df = _load_challenge_moves_csv("chess_challenges_full.csv", shuffle=True, max_samples=max_samples)
train_dataset = train_df.apply(lambda row: process_fn(row, row.name, "train"), axis=1)
train_dataset = pd.DataFrame(train_dataset.tolist())
test_df = _load_challenge_moves_csv("chess_challenges_full.csv", shuffle=True, max_samples=test_samples)
# Label these rows with their real split ("test"), not "train".
test_dataset = test_df.apply(lambda row: process_fn(row, row.name, "test"), axis=1)
test_dataset = pd.DataFrame(test_dataset.tolist())

# Nested dicts do not round-trip cleanly through parquet, so store the
# reward_model column as a JSON string instead.
train_dataset["reward_model"] = train_dataset["reward_model"].apply(json.dumps)
test_dataset["reward_model"] = test_dataset["reward_model"].apply(json.dumps)

# Save our parquets down (create the output folder on first use)
parquet_dir = os.path.join(DATA_ROOT, "parquet_datasets")
os.makedirs(parquet_dir, exist_ok=True)
train_parquet_filename = f"train-{experiment_name}-{max_samples//1000}k.parquet"
test_parquet_filename = f"test-{experiment_name}-{test_samples}.parquet"
train_dataset.to_parquet(os.path.join(parquet_dir, train_parquet_filename), index=False)
test_dataset.to_parquet(os.path.join(parquet_dir, test_parquet_filename), index=False)