In [4]:
from datasets import load_dataset

# Ask for the "train" split up front so a single Dataset is returned,
# then echo it as the cell's output.
ds = load_dataset("nbroad/apigen-with-thinking-1.5k", split="train")
ds

Dataset({
    features: ['messages', 'tools'],
    num_rows: 1500
})

In [5]:
from transformers import AutoTokenizer

# Checkpoint whose tokenizer and chat template are used for all tokenization below.
MODEL_ID = "Qwen/Qwen3-14B"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

In [8]:
import json


def tokenize(sample):
    """
    Tokenize one conversation and build labels that train only on its last message.

    The conversation is rendered twice through the chat template: once in full,
    and once without its last message but with the generation prompt appended.
    Every position covered by the second render is masked with -100.

    Args:
        sample: Mapping whose "messages" and "tools" values are JSON-encoded
            strings; both are decoded and handed to the chat template.

    Returns:
        Dict with "input_ids" (token ids of the full conversation),
        "attention_mask" (all ones, same length) and "labels" (-100 for each
        prompt position, the token id for each position of the last message).

    Raises:
        AssertionError: If the prompt render is not a token-for-token prefix of
            the full render, i.e. the positional mask would be misaligned.
    """
    # Decode each JSON payload once and reuse it for both template renders.
    messages = json.loads(sample["messages"])
    tools = json.loads(sample["tools"])

    full_ids = tokenizer.apply_chat_template(messages, tokenize=True, tools=tools)
    prefix_ids = tokenizer.apply_chat_template(
        messages[:-1], tokenize=True, tools=tools, add_generation_prompt=True
    )

    # The mask below is purely positional, so it is only correct when the prompt
    # render really is the beginning of the full render. A length comparison
    # alone cannot detect a template that renders the two differently.
    assert full_ids[:len(prefix_ids)] == prefix_ids, (
        "prompt tokens are not a prefix of the full conversation tokens"
    )

    # Masking all tokens from loss except for the last assistant message
    labels = [-100] * len(prefix_ids) + full_ids[len(prefix_ids):]

    assert len(full_ids) == len(labels)

    return {
        "input_ids": full_ids,
        "attention_mask": [1] * len(full_ids),
        "labels": labels
    }


# Run `tokenize` on every example individually (no batching), spread over 16 worker processes.
tokenized_ds = ds.map(
    function=tokenize,
    batched=False,
    num_proc=16,
)

Map (num_proc=16):   0%|          | 0/1500 [00:00<?, ? examples/s]

In [9]:
def visualize_masking(input_ids, labels):
    """
    Visualize token masking using ANSI colors.

    Each token id is decoded on its own with the notebook-level ``tokenizer``
    and printed in red when its label is -100, green otherwise. All tokens are
    written back-to-back in a single ``print`` call after a short legend.

    Args:
        input_ids: List of token IDs
        labels: List of labels (-100 for masked, token_id for unmasked).
            Ids and labels are paired with ``zip``, so entries beyond the
            shorter of the two lists are not shown.

    Returns:
        None. The visualization is written to stdout.
    """
    # ANSI color codes
    RED = '\033[31m'      # Masked tokens
    GREEN = '\033[32m'    # Unmasked tokens
    RESET = '\033[0m'     # Reset color
    BOLD = '\033[1m'      # Bold text

    print(f"\n{BOLD}Token Masking Visualization{RESET}")
    print(f"{RED}Red = Masked (-100){RESET}, {GREEN}Green = Unmasked (tokens to train on){RESET}")
    print("-" * 80)

    # Decode per id (not the whole sequence at once) so every token can carry
    # its own color.
    colored_tokens = [
        f"{RED if label == -100 else GREEN}{tokenizer.decode(token_id)}{RESET}"
        for token_id, label in zip(input_ids, labels)
    ]

    # Nothing is printed after the legend when there are no tokens to show.
    if colored_tokens:
        print(''.join(colored_tokens))

In [10]:
# Fixed seed so a fresh Restart & Run All previews the same example every time;
# change the value to inspect a different one.
PREVIEW_SEED = 0
x = tokenized_ds.shuffle(seed=PREVIEW_SEED)[0]

visualize_masking(x["input_ids"], x["labels"])


[1mToken Masking Visualization[0m
[31mRed = Masked (-100)[0m, [32mGreen = Unmasked (tokens to train on)[0m
--------------------------------------------------------------------------------
[31m<|im_start|>[0m[31msystem[0m[31m
[0m[31m#[0m[31m Retail[0m[31m agent[0m[31m policy[0m[31m
[0m[31mAs[0m[31m a[0m[31m retail[0m[31m agent[0m[31m,[0m[31m you[0m[31m can[0m[31m help[0m[31m users[0m[31m cancel[0m[31m or[0m[31m modify[0m[31m pending[0m[31m orders[0m[31m,[0m[31m return[0m[31m or[0m[31m exchange[0m[31m delivered[0m[31m orders[0m[31m,[0m[31m modify[0m[31m their[0m[31m default[0m[31m user[0m[31m address[0m[31m,[0m[31m or[0m[31m provide[0m[31m information[0m[31m about[0m[31m their[0m[31m own[0m[31m profile[0m[31m,[0m[31m orders[0m[31m,[0m[31m and[0m[31m related[0m[31m products[0m[31m.
[0m[31m-[0m[31m At[0m[31m the[0m[31m beginning[0m[31m of[0m[31m the[0m[31m conversation[0m