# Inspecting the dataset


https://huggingface.co/datasets/Salesforce/APIGen-MT-5k

In [1]:
from datasets import load_dataset

# Hub repository id of the dataset inspected in this notebook
DATASET_ID = "Salesforce/APIGen-MT-5k"

ds = load_dataset(DATASET_ID)
ds

README.md: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/5000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['conversations', 'tools', 'system'],
        num_rows: 5000
    })
})

In [2]:
# Peek at the first training record to see the raw schema:
# 'conversations' is a list of turns, each with a 'from' role and a 'value' string.
ds['train'][0]

{'conversations': [{'from': 'human',
   'value': 'Hello, I have a request regarding a past flight reservation, 0U4NPP, from PHL to DEN. I was wondering if I could still cancel it due to personal reasons.'},
  {'from': 'gpt',
   'value': 'To assist you with the cancellation request, I need to verify a few details:\n\n1. Your user ID.\n2. The reason for cancellation (change of plan, airline cancelled flight, or other reasons).\n\nPlease provide this information so I can check the eligibility for cancellation.'},
  {'from': 'human',
   'value': "Sure. My user ID is amelia_rossi_1651. The reason for the cancellation request is personal. I understand if it's an unusual request as the flights have already landed, but I thought it was worth checking if there's any flexibility."},
  {'from': 'function_call',
   'value': '{"name": "get_reservation_details", "arguments": {"reservation_id": "0U4NPP"}}'},
  {'from': 'observation',
   'value': '{"reservation_id": "0U4NPP", "user_id": "amelia_rossi_

In [3]:
import ipywidgets as widgets
from IPython.display import display, HTML
import json

def format_message(msg):
    """Render one conversation turn as an HTML chat bubble.

    Args:
        msg: Mapping with a 'from' key (speaker role: 'human', 'gpt',
            'function_call', 'observation', or anything else) and a 'value'
            key (the message text; JSON text for tool calls/observations).

    Returns:
        str: An HTML fragment made of a container div, a role label and a
        styled bubble. All dataset-provided text is HTML-escaped so that
        characters such as '<' or '&' are shown literally instead of being
        interpreted as markup.
    """
    import html  # local import keeps this cell self-contained

    role = msg['from']
    value = msg['value']
    # Tolerate missing / non-string payloads instead of failing on .replace()
    text = '' if value is None else str(value)

    def escape(raw):
        # quote=False: the text is placed in element content, never in attributes
        return html.escape(raw, quote=False)

    # Color scheme for different roles
    colors = {
        'human': {'bg': '#007bff', 'text': 'white', 'align': 'right', 'label': 'Human'},
        'gpt': {'bg': '#e9ecef', 'text': 'black', 'align': 'left', 'label': 'Assistant'},
        'function_call': {'bg': '#ffc107', 'text': 'black', 'align': 'left', 'label': 'Function Call'},
        'observation': {'bg': '#28a745', 'text': 'white', 'align': 'left', 'label': 'Observation'}
    }

    # Unknown roles get a neutral grey bubble labelled with the raw role name
    style = colors.get(role, {'bg': '#6c757d', 'text': 'white', 'align': 'left', 'label': role})
    label = escape(str(style['label']))

    # Default rendering: escaped text with newlines turned into <br>
    formatted_value = escape(text).replace('\n', '<br>')

    # Tool calls and observations are pretty-printed when they hold valid JSON
    if role in ('function_call', 'observation') and text:
        try:
            parsed = json.loads(text)
        except ValueError:
            # Not JSON (e.g. a plain-text error message): keep the default rendering
            pass
        else:
            if role == 'function_call' and isinstance(parsed, dict):
                name = escape(str(parsed.get('name', 'unknown')))
                arguments = escape(json.dumps(parsed.get('arguments', {}), indent=2))
                formatted_value = f"<strong>{name}</strong><br><pre style='margin: 5px 0; font-size: 0.9em;'>{arguments}</pre>"
            else:
                # Observations, and tool calls that are not a single object (e.g. a list)
                pretty = escape(json.dumps(parsed, indent=2))
                formatted_value = f"<pre style='margin: 0; font-size: 0.9em;'>{pretty}</pre>"

    bubble_style = f"""
        background-color: {style['bg']};
        color: {style['text']};
        padding: 10px 15px;
        border-radius: 18px;
        margin: 8px 0;
        max-width: 70%;
        word-wrap: break-word;
        display: inline-block;
        text-align: {style['align']};
        box-shadow: 0 1px 2px rgba(0,0,0,0.1);
    """

    label_style = """
        font-size: 0.75em;
        font-weight: bold;
        margin-bottom: 4px;
        opacity: 0.8;
    """

    # Human turns hug the right edge, everything else the left
    if style['align'] == 'right':
        container_style = "text-align: right; margin: 8px 0;"
    else:
        container_style = "text-align: left; margin: 8px 0;"

    return f"""
    <div style="{container_style}">
        <div style="{label_style}">{label}</div>
        <div style="{bubble_style}">
            {formatted_value}
        </div>
    </div>
    """

def display_conversation(index):
    """Build the rendered chat view for one training example.

    Looks up ``ds['train'][index]``, formats every turn with
    ``format_message`` and wraps the bubbles in a scrollable container.
    Returns an ``IPython.display.HTML`` object.
    """
    turns = ds['train'][index]['conversations']

    container_open = """
    <div style="
        font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
        padding: 20px;
        background-color: #f8f9fa;
        border-radius: 10px;
        max-height: 600px;
        overflow-y: auto;
    ">
    """

    bubbles = ''.join(format_message(turn) for turn in turns)

    return HTML(container_open + bubbles + "</div>")

def create_ui():
    """Assemble the conversation viewer widget.

    The viewer is a vertical box holding a title, an integer slider that
    selects the example index, a line with the total number of
    conversations, and an output area showing the selected conversation.
    """
    # Pane that holds the rendered conversation
    output = widgets.Output()

    # Slider over every valid index of the train split
    index_slider = widgets.IntSlider(
        value=0,
        min=0,
        max=len(ds['train']) - 1,
        step=1,
        description='Index:',
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='500px')
    )

    def update_conversation(change):
        # Replace the pane's content with the newly selected conversation
        with output:
            output.clear_output()
            display(display_conversation(change['new']))

    index_slider.observe(update_conversation, names='value')

    # Show the first conversation before any slider interaction
    with output:
        display(display_conversation(0))

    title = widgets.HTML("<h2>Conversation Viewer</h2>")
    total = widgets.HTML(f"<p>Total conversations: {len(ds['train'])}</p>")

    return widgets.VBox([title, index_slider, total, output])

# Build the viewer and render it as this cell's output;
# moving the slider re-renders the conversation at the chosen index.
display(create_ui())

VBox(children=(HTML(value='<h2>Conversation Viewer</h2>'), IntSlider(value=0, description='Index:', layout=Lay…

# Formatting for training

Each model family has its own chat template and tool-calling syntax:

Qwen3: https://huggingface.co/Qwen/Qwen3-1.7B?chat_template=default  
Gemma: https://ai.google.dev/gemma/docs/capabilities/function-calling  
Llama 3.1: https://www.llama.com/docs/model-cards-and-prompt-formats/llama3_1/#-tool-calling-(8b/70b/405b)-  
Llama 3.3: https://www.llama.com/docs/model-cards-and-prompt-formats/llama3_3/#-zero-shot-function-calling-  
Llama 4: https://www.llama.com/docs/model-cards-and-prompt-formats/llama4/#-zero-shot-function-calling---python-format-

GLM 4.7 uses interleaved thinking, which differs from Qwen3's approach: Qwen3 removes historical thinking in a multi-turn conversation, whereas GLM 4.7 keeps it: https://docs.z.ai/guides/capabilities/thinking-mode


Note: This dataset has no thinking involved, and nowadays most models do better when they are able to think for a bit before making a tool call. It may be worthwhile to use a larger model to create reasoning traces. For Qwen, only the last assistant message is allowed to contain thinking - all other reasoning content will be deleted from the templated input. Adding thinking for Qwen3 would look like this:

```python
last_assistant_message = {"role": "assistant", "content": "The capital of Germany is Berlin.", "reasoning_content": "Okay, let me think about this. The simple answer is Berlin and it doesn't look like there any other details. Will respond now."}

messages = [
    {"role": "user", "content": "What is the capital of Germany?"},
    last_assistant_message
]

print(tokenizer.apply_chat_template(messages, tokenize=False))
```

```text
<|im_start|>user
What is the capital of Germany?<|im_end|>
<|im_start|>assistant
<think>
Okay, let me think about this. The simple answer is Berlin and it doesn't look like there any other details. Will respond now.
</think>

The capital of Germany is Berlin.<|im_end|>
```

In [4]:
# Collect every distinct speaker tag ('from' value) used across all conversations
all_roles = {
    message['from']
    for conversation in ds['train']["conversations"]
    for message in conversation
}

all_roles

{'function_call', 'gpt', 'human', 'observation'}

In [5]:
import json
from transformers import AutoTokenizer

# Dataset speaker tags -> roles passed to the chat template.
# Tool calls are emitted by the model, so they map to "assistant";
# tool results ("observation") map to "tool".
role_mapping = dict(
    function_call="assistant",
    observation="tool",
    gpt="assistant",
    human="user",
)

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-1.7B")


def format_for_qwen3(sample):
    """Render one dataset record as a single chat-templated string.

    Args:
        sample: Record with 'conversations' (list of {'from', 'value'} turns),
            'tools' (JSON-encoded tool definitions) and 'system' (system prompt).

    Returns:
        dict: ``{"text": ...}`` where the value is the output of
        ``tokenizer.apply_chat_template`` (untokenized) for the system prompt
        followed by every turn, with the decoded tools passed along.
    """
    turns = sample["conversations"]
    tool_specs = json.loads(sample["tools"])
    system_prompt = sample["system"]

    def to_chat_message(turn):
        chat_role = role_mapping[turn["from"]]
        if turn["from"] != "function_call":
            return {"role": chat_role, "content": turn["value"]}

        # A function_call turn carries JSON: either one call object or a list of them
        calls = json.loads(turn["value"])
        if not isinstance(calls, list):
            calls = [calls]
        return {"role": chat_role, "content": "", "tool_calls": calls}

    messages = [{"role": "system", "content": system_prompt}]
    messages.extend(to_chat_message(turn) for turn in turns)

    rendered = tokenizer.apply_chat_template(messages, tokenize=False, tools=tool_specs)
    return {"text": rendered}


# Add a "text" column with each conversation rendered by format_for_qwen3 (16 worker processes).
# NOTE: this rebinds `ds`, so later cells operate on the dataset that includes "text".
ds = ds.map(format_for_qwen3, batched=False, num_proc=16)

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Map (num_proc=16):   0%|          | 0/5000 [00:00<?, ? examples/s]

# Creating labels


Masking all tokens except for those that are generated by the model (assistant messages)

In [6]:
# Render a small hand-written conversation to see exactly which tokens the chat
# template emits around each message (role headers, <|im_start|>/<|im_end|>, think tags).

sample_messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hi there"},
    {"role": "assistant", "content": "Hello"},
    {"role": "user", "content": "What is the capital of Germany?"},
    {"role": "assistant", "content": "The capital of Germany is Berlin."},
]

# tokenize=True returns token ids instead of the rendered string
sample_ids = tokenizer.apply_chat_template(sample_messages, tokenize=True)

# Decoded text, special tokens included, to read the template output as a whole
print(tokenizer.decode(sample_ids, skip_special_tokens=False))

# (id, token) pairs: the source of the special/role token ids used for label masking
print(list(zip(sample_ids, tokenizer.convert_ids_to_tokens(sample_ids))))

<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
Hi there<|im_end|>
<|im_start|>assistant
Hello<|im_end|>
<|im_start|>user
What is the capital of Germany?<|im_end|>
<|im_start|>assistant
<think>

</think>

The capital of Germany is Berlin.<|im_end|>

[(151644, '<|im_start|>'), (8948, 'system'), (198, 'Ċ'), (2610, 'You'), (525, 'Ġare'), (264, 'Ġa'), (10950, 'Ġhelpful'), (17847, 'Ġassistant'), (13, '.'), (151645, '<|im_end|>'), (198, 'Ċ'), (151644, '<|im_start|>'), (872, 'user'), (198, 'Ċ'), (13048, 'Hi'), (1052, 'Ġthere'), (151645, '<|im_end|>'), (198, 'Ċ'), (151644, '<|im_start|>'), (77091, 'assistant'), (198, 'Ċ'), (9707, 'Hello'), (151645, '<|im_end|>'), (198, 'Ċ'), (151644, '<|im_start|>'), (872, 'user'), (198, 'Ċ'), (3838, 'What'), (374, 'Ġis'), (279, 'Ġthe'), (6722, 'Ġcapital'), (315, 'Ġof'), (9856, 'ĠGermany'), (30, '?'), (151645, '<|im_end|>'), (198, 'Ċ'), (151644, '<|im_start|>'), (77091, 'assistant'), (198, 'Ċ'), (151667, '<think>'), (271, 'ĊĊ'), (151

In [7]:
# Token ids read off the (id, token) listing printed for the sample conversation:
# message delimiters first, then the role names that follow <|im_start|>.
IM_START, IM_END = 151644, 151645

ROLE_TOKENS = {
    'system': 8948,
    'user': 872,
    'assistant': 77091,
}

SYSTEM_TOKEN_ID = ROLE_TOKENS['system']
USER_TOKEN_ID = ROLE_TOKENS['user']
ASSISTANT_TOKEN_ID = ROLE_TOKENS['assistant']

In [8]:
def create_labels_with_assistant_only(input_ids, im_start=None, im_end=None, role_tokens=None):
    """
    Create labels for training only on assistant responses.

    A message starts at an ``im_start`` token, whose following token names the
    role, and ends at the next ``im_end`` token. Inside assistant messages the
    content tokens and the closing ``im_end`` keep their ids; everything else
    (message delimiters, role headers, other roles' content, tokens between
    messages) is set to -100.

    Role tokens are masked only in the header position (directly after
    ``im_start``). The same vocabulary ids can occur inside assistant content
    (for example ``user`` in a ``"user_id"`` JSON key of a tool call); there
    they are ordinary content and are kept.

    Args:
        input_ids: List of token IDs from tokenizer.apply_chat_template()
        im_start: Id of the message-start token; defaults to the notebook's IM_START.
        im_end: Id of the message-end token; defaults to the notebook's IM_END.
        role_tokens: Mapping of role name -> header token id; defaults to the
            notebook's ROLE_TOKENS. Only the role named 'assistant' is trained on.

    Returns:
        labels: List (same length as input_ids) where assistant tokens keep
        their original IDs and others are -100
    """
    # Resolve defaults lazily so explicit arguments work without the notebook globals
    start_id = IM_START if im_start is None else im_start
    end_id = IM_END if im_end is None else im_end
    roles = ROLE_TOKENS if role_tokens is None else role_tokens
    role_by_token = {token: name for name, token in roles.items()}

    labels = []
    active_role = None      # role of the message currently open, None between messages
    after_start = False     # True when the previous token was im_start
    last_index = len(input_ids) - 1

    for i, token_id in enumerate(input_ids):
        follows_start, after_start = after_start, token_id == start_id

        if token_id == start_id:
            # Message delimiter: never trained on
            labels.append(-100)
            # Only an im_start seen outside a message opens a new one; an im_start
            # inside a message leaves the current masking context unchanged.
            if active_role is None and i < last_index:
                active_role = role_by_token.get(input_ids[i + 1])

        elif token_id == end_id:
            # The model must learn to stop, so the end token of an assistant message is kept
            labels.append(token_id if active_role == 'assistant' else -100)
            active_role = None

        elif follows_start and token_id in role_by_token:
            # Role header right after im_start: part of the template, always masked
            labels.append(-100)

        elif active_role == 'assistant':
            labels.append(token_id)  # assistant content: train on it

        else:
            labels.append(-100)  # any other content: masked

    return labels


# Three passes over the train split: tokenize the rendered text (the template already
# inserted the special tokens, hence add_special_tokens=False), attend to every token,
# then derive the assistant-only labels.
tokenized_ds = (
    ds['train']
    .map(lambda row: {"input_ids": tokenizer.encode(row['text'], add_special_tokens=False)}, batched=False, num_proc=16)
    .map(lambda row: {"attention_mask": [1] * len(row["input_ids"])}, batched=False, num_proc=16)
    .map(lambda row: {"labels": create_labels_with_assistant_only(row["input_ids"])}, batched=False, num_proc=16)
)

Map (num_proc=16):   0%|          | 0/5000 [00:00<?, ? examples/s]

Map (num_proc=16):   0%|          | 0/5000 [00:00<?, ? examples/s]

Map (num_proc=16):   0%|          | 0/5000 [00:00<?, ? examples/s]

## Visualize the masking

In [9]:
def visualize_masking(input_ids, labels, tokens=None):
    """
    Visualize token masking using ANSI colors.

    Prints a short legend followed by the whole sequence on one stream, each
    token colored red when its label is -100 (masked) and green otherwise.

    Args:
        input_ids: List of token IDs
        labels: List of labels (-100 for masked, token_id for unmasked)
        tokens: Optional list of token strings aligned with ``labels``. When
            omitted, each id in ``input_ids`` is decoded with the notebook's
            global ``tokenizer``.

    Returns:
        None. The visualization is written to stdout.
    """
    # ANSI color codes
    RED = '\033[31m'      # Masked tokens
    GREEN = '\033[32m'    # Unmasked tokens
    RESET = '\033[0m'     # Reset color
    BOLD = '\033[1m'      # Bold text

    print(f"\n{BOLD}Token Masking Visualization{RESET}")
    print(f"{RED}Red = Masked (-100){RESET}, {GREEN}Green = Unmasked (tokens to train on){RESET}")
    print("-" * 80)

    if tokens is None:
        # Decode ids one at a time so every token can be colored individually
        tokens = [tokenizer.decode(token_id) for token_id in input_ids]

    colored_tokens = [
        f"{RED if label == -100 else GREEN}{token}{RESET}"
        for token, label in zip(tokens, labels)
    ]

    # Nothing is printed after the legend for an empty sequence
    if colored_tokens:
        print(''.join(colored_tokens))

In [10]:
# Pick a pseudo-random example; the fixed seed makes the same example (and therefore
# the same visualization) appear on every Restart & Run All.
x = tokenized_ds.shuffle(seed=42)[0]

visualize_masking(x["input_ids"], x["labels"])


[1mToken Masking Visualization[0m
[31mRed = Masked (-100)[0m, [32mGreen = Unmasked (tokens to train on)[0m
--------------------------------------------------------------------------------
[31m<|im_start|>[0m[31msystem[0m[31m
[0m[31m#[0m[31m Retail[0m[31m agent[0m[31m policy[0m[31m
[0m[31mAs[0m[31m a[0m[31m retail[0m[31m agent[0m[31m,[0m[31m you[0m[31m can[0m[31m help[0m[31m users[0m[31m cancel[0m[31m or[0m[31m modify[0m[31m pending[0m[31m orders[0m[31m,[0m[31m return[0m[31m or[0m[31m exchange[0m[31m delivered[0m[31m orders[0m[31m,[0m[31m modify[0m[31m their[0m[31m default[0m[31m user[0m[31m address[0m[31m,[0m[31m or[0m[31m provide[0m[31m information[0m[31m about[0m[31m their[0m[31m own[0m[31m profile[0m[31m,[0m[31m orders[0m[31m,[0m[31m and[0m[31m related[0m[31m products[0m[31m.
[0m[31m-[0m[31m At[0m[31m the[0m[31m beginning[0m[31m of[0m[31m the[0m[31m conversation[0m

# Saving to parquet

In [11]:
# Keep only the columns written to the training files
cols2keep = ["input_ids", "labels", "attention_mask"]
cols = tokenized_ds.column_names
# remove_columns takes a list of names; build it in the dataset's own column order
final_ds = tokenized_ds.remove_columns([col for col in cols if col not in cols2keep])

# 90/10 train/validation split; the fixed seed keeps the split identical across runs
# so that re-generated files (and evaluation numbers) stay comparable.
SPLIT_SEED = 42
split_final_ds = final_ds.train_test_split(test_size=0.1, seed=SPLIT_SEED)


train_filename = "apigen-qwen3-1.7b-5k-train.parquet"
test_filename = "apigen-qwen3-1.7b-5k-test.parquet"

split_final_ds["train"].to_parquet(train_filename)
split_final_ds["test"].to_parquet(test_filename)

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

42486165

# Uploading the training files with the Together AI fine-tuning API

In [13]:
from together import Together
import os
from dotenv import load_dotenv

# Read credentials from ../.env (path is relative to the current working directory).
# load_dotenv's return value is used as the success flag, so fail loudly if it is falsy.
loaded = load_dotenv("../.env", override=True)
if not loaded:
    raise ValueError("Failed to load .env file")

# os.environ[...] raises KeyError when TOGETHER_API_KEY is not defined
together_client = Together(api_key=os.environ["TOGETHER_API_KEY"])

In [14]:
from together.utils import check_file

# Run Together's file check on both parquet files before uploading;
# the tuple of the two reports is shown as the cell output.
check_file(train_filename), check_file(test_filename)

({'is_check_passed': True,
  'message': 'Checks passed',
  'found': True,
  'file_size': 35406094,
  'utf8': None,
  'line_type': None,
  'text_field': None,
  'key_value': None,
  'has_min_samples': None,
  'num_samples': 4500,
  'load_json': None,
  'load_csv': None,
  'filetype': 'parquet'},
 {'is_check_passed': True,
  'message': 'Checks passed',
  'found': True,
  'file_size': 3473763,
  'utf8': None,
  'line_type': None,
  'text_field': None,
  'key_value': None,
  'has_min_samples': None,
  'num_samples': 500,
  'load_json': None,
  'load_csv': None,
  'filetype': 'parquet'})

In [15]:
# Upload both files; the returned objects carry the file ids used when creating the job
train_upload_details = together_client.files.upload(train_filename)
test_upload_details = together_client.files.upload(test_filename)

# NOTE(review): the printed responses include project and organization ids —
# consider clearing this cell's output before sharing the notebook.
print(train_upload_details)
print(test_upload_details)

Uploading file apigen-qwen3-1.7b-5k-train.parquet: 100%|██████████| 35.4M/35.4M [00:01<00:00, 27.2MB/s]
Uploading file apigen-qwen3-1.7b-5k-test.parquet: 100%|██████████| 3.47M/3.47M [00:00<00:00, 11.8MB/s]


id='file-d6ba3d9f-f072-4269-a0c1-14b9a83dc570' object='file' created_at=1767743632 type=None purpose=<FilePurpose.FineTune: 'fine-tune'> filename='apigen-qwen3-1.7b-5k-train.parquet' bytes=35406094 line_count=0 processed=True FileType='parquet' project_id='proj_CLMvkn3oTfrK2ice84mzd' organization_id='org_CLMvkn2JoTwWdDdZH3JFt'
id='file-9d10c3ee-b9d3-4a77-a004-c1d2440f5a59' object='file' created_at=1767743634 type=None purpose=<FilePurpose.FineTune: 'fine-tune'> filename='apigen-qwen3-1.7b-5k-test.parquet' bytes=3473763 line_count=0 processed=True FileType='parquet' project_id='proj_CLMvkn3oTfrK2ice84mzd' organization_id='org_CLMvkn2JoTwWdDdZH3JFt'


# Starting finetuning job

In [16]:
# Launch a LoRA fine-tuning job on the uploaded files: one epoch, with one evaluation
# on the validation file and one checkpoint.
# wandb_api_key is None when WANDB_API_KEY is not set (os.environ.get).
# NOTE(review): the files already carry per-token labels; presumably those decide which
# tokens contribute to the loss regardless of train_on_inputs — confirm against the API docs.
ft_resp = together_client.fine_tuning.create(
    training_file=train_upload_details.id,
    validation_file=test_upload_details.id,
    model="Qwen/Qwen3-1.7B",
    n_epochs=1,
    n_evals=1,
    n_checkpoints=1,
    lora=True, 
    warmup_ratio=0.1,
    learning_rate=5e-5,
    suffix="apigen-5k",
    train_on_inputs="auto",
    wandb_api_key=os.environ.get("WANDB_API_KEY"),
    wandb_project_name="apigen-ft"
)

In [None]:
# Fetch the job created above by its id and print the response; re-run to refresh
resp = together_client.fine_tuning.retrieve(ft_resp.id)
print(resp)