# Data Generation and SFT Tutorial

This tutorial walks through an end-to-end example of the pipeline: generate tasks → generate teacher traces → perform supervised fine-tuning (SFT).

# Data Generation

In [1]:
import os
import sys
import json
import tempfile
import subprocess
from pathlib import Path
from typing import List, Dict, Any

from data.commons import upload_tasks_to_hf, upload_traces_to_hf
from scripts.harbor.run_and_export_traces import run_dataset_to_traces

In [2]:
def clone_inferredbugs() -> Path:
    """Clone the InferredBugs repository into a fresh temporary directory.

    Returns:
        Path to the checkout (``<tmpdir>/InferredBugs``). On success the
        temporary directory is left in place; the caller owns it.

    Raises:
        subprocess.CalledProcessError: If ``git clone`` exits non-zero. Git's
            captured stderr is echoed first so the reason is visible.
        OSError: If the ``git`` executable cannot be started.

    In both failure cases the temporary directory is removed before the
    exception propagates, so failed attempts do not pile up on disk.
    """
    import shutil

    temp_dir = Path(tempfile.mkdtemp(prefix="inferredbugs_"))
    repo_path = temp_dir / "InferredBugs"

    cmd = ["git", "clone", "https://github.com/microsoft/InferredBugs.git", str(repo_path)]
    try:
        subprocess.run(cmd, check=True, capture_output=True)
    except subprocess.CalledProcessError as err:
        # Output is captured, so git's message would otherwise be lost.
        if err.stderr:
            print(err.stderr.decode("utf-8", errors="replace"), file=sys.stderr)
        shutil.rmtree(temp_dir, ignore_errors=True)
        raise
    except OSError:
        # e.g. git is not installed; nothing useful was written.
        shutil.rmtree(temp_dir, ignore_errors=True)
        raise

    return repo_path

# Clone when the cell runs; later cells read the checkout location from `repo_path`.
repo_path = clone_inferredbugs()
print(f"Cloned to: {repo_path}")

Cloned to: /tmp/user/24984/inferredbugs_etq4t4ac/InferredBugs


In [3]:
def find_bugs(repo_path: Path) -> List[Dict[str, Any]]:
    """Collect every bug record under the ``csharp`` and ``java`` folders.

    A bug directory is any directory below
    ``<repo_path>/inferredbugs/<language>`` that contains a ``bug.json`` file.
    Directories are visited in sorted order so the returned list is the same
    on every run, regardless of filesystem ordering.

    Args:
        repo_path: Root of an InferredBugs checkout.

    Returns:
        One dict per readable bug with the keys ``path``, ``language``,
        ``bug_data`` (the parsed ``bug.json``), ``additional_data`` (contents
        of whichever optional before/after text files exist, keyed by file
        name without ``.txt``), ``project_name`` (parent directory name) and
        ``bug_id`` (directory name). A missing language folder contributes
        nothing. Bugs whose files cannot be read, decoded or parsed are
        skipped, and the number skipped is reported once on stderr.
    """
    optional_files = ["file_before.txt", "file_after.txt", "method_before.txt", "method_after.txt"]
    bugs = []
    skipped = 0

    for language in ["csharp", "java"]:
        lang_path = repo_path / "inferredbugs" / language
        if not lang_path.exists():
            continue

        for root, dirs, files in os.walk(lang_path):
            # Sorting in place makes os.walk descend in a stable order.
            dirs.sort()
            if "bug.json" not in files:
                continue

            bug_dir = Path(root)
            try:
                with open(bug_dir / "bug.json", 'r', encoding='utf-8') as f:
                    bug_data = json.load(f)

                # Read before/after code if available
                additional = {}
                for fname in optional_files:
                    fpath = bug_dir / fname
                    if fpath.exists():
                        additional[fname.replace('.txt', '')] = fpath.read_text(encoding='utf-8')
            except (OSError, ValueError):
                # ValueError covers both malformed JSON and undecodable bytes.
                skipped += 1
                continue

            bugs.append({
                'path': str(root),
                'language': language,
                'bug_data': bug_data,
                'additional_data': additional,
                'project_name': bug_dir.parent.name,
                'bug_id': bug_dir.name
            })

    if skipped:
        print(f"Skipped {skipped} unreadable bug directories", file=sys.stderr)

    return bugs

# Scan the cloned checkout; the parsed records in `bugs` feed the instruction cell.
bugs = find_bugs(repo_path)
print(f"Found {len(bugs)} bugs")

Found 11186 bugs


In [4]:
def create_instruction(bug: Dict[str, Any]) -> str:
    """Render one bug record as a Markdown task prompt.

    The prompt always starts with a header (language, project, bug id) and
    ends with the task statement. A description line is added when
    ``bug_data`` has a ``description`` key, and the buggy file is embedded in
    a fenced code block only when both ``file_before`` and ``file_after`` are
    present in ``additional_data``.
    """
    details = bug['bug_data']
    extras = bug.get('additional_data', {})
    lang = bug['language']

    sections = [
        f"# Bug Fix Task - {lang.title()}\n\n",
        f"**Project:** {bug['project_name']}\n",
        f"**Bug ID:** {bug['bug_id']}\n\n",
    ]

    if 'description' in details:
        sections.append(f"**Description:** {details['description']}\n\n")

    has_before_and_after = all(key in extras for key in ('file_before', 'file_after'))
    if has_before_and_after:
        sections.append(
            f"### Code (Buggy Version):\n```{lang}\n{extras['file_before']}\n```\n\n"
        )

    sections.append("## Task\nFix the bug in the code.\n")
    return "".join(sections)

# One prompt per bug, in the same order as `bugs`.
instructions = [create_instruction(bug) for bug in bugs]
print(f"Created {len(instructions)} instructions")

Created 11186 instructions


In [5]:
def generate_tasks(instructions: List[str], dataset_name: str = "inferredbugs") -> Path:
    """Turn the instructions into Harbor-compatible task directories.

    Thin wrapper around ``data.commons.generate_tasks_from_questions``; the
    location that helper returns is wrapped in a ``Path``.
    """
    from data.commons import generate_tasks_from_questions as build_tasks

    return Path(build_tasks(instructions, dataset_name))

# Materialize the tasks on disk; `task_dir` is what the subsampling cell reads.
task_dir = generate_tasks(instructions, "inferredbugs")
print(f"Tasks saved to: {task_dir}")

Generating tasks in: /tmp/user/24984/inferredbugs_tasks_7r0jqr3r


11186it [00:02, 3999.65it/s]

Generated 11186 tasks successfully!
Tasks saved to: /tmp/user/24984/inferredbugs_tasks_7r0jqr3r





In [None]:
def subsample_tasks(task_dir: Path, max_tasks: int = 100) -> Path:
    """Pick a random subset of the tasks so later steps iterate faster.

    Thin wrapper around ``data.commons.subsample_tasks_directory``: the task
    directory is passed as a string and the returned location is wrapped in a
    ``Path``.
    """
    from data.commons import subsample_tasks_directory as pick_subset

    return Path(pick_subset(str(task_dir), max_tasks))

# Upper bound on how many tasks are kept for trace generation and upload.
NUM_SAMPLES = 100 # TODO: Fill in the number of tasks to subsample to
subsampled_task_dir = subsample_tasks(task_dir, max_tasks=NUM_SAMPLES)
print(f"Task Directory: {subsampled_task_dir}")

GCS cache disabled for subsample_tasks_directory; falling back to no-op caching: Your default credentials were not found. To set up Application Default Credentials, see https://cloud.google.com/docs/authentication/external/set-up-adc for more information.
Creating subsampled tasks in: /tmp/user/24984/subsampled_g4l9mbqy
Successfully subsampled 10 tasks from 11186 total tasks!
Task Directory: /tmp/user/24984/subsampled_g4l9mbqy


In [None]:
# NOTE: Fill in your API keys below
# os.environ["DAYTONA_API_KEY"] = ""
# os.environ["OPENAI_API_KEY"] = ""

In [7]:
# Fix for Jupyter notebook event loop conflict: the notebook kernel already
# runs an event loop, so patch asyncio before the trace-generation cell runs.
# NOTE(review): presumably run_dataset_to_traces drives asyncio internally — confirm.
import nest_asyncio
nest_asyncio.apply()

In [None]:
# Generate Teacher Traces

def run_traces(
    task_dir: Path,
    model_name: str = "gpt-5-mini",
    agent_name: str = "terminus-2",
    n_concurrent: int = 8,
    *,
    max_episodes: int = 3,
    disable_verification: bool = True,
):
    """Run agent trials over a task directory and collect the traces.

    Args:
        task_dir: Directory of tasks to run; passed on as a string.
        model_name: Model identifier forwarded to ``run_dataset_to_traces``.
        agent_name: Agent identifier forwarded to ``run_dataset_to_traces``.
        n_concurrent: Concurrency setting forwarded to ``run_dataset_to_traces``.
        max_episodes: Value sent to the agent as ``agent_kwargs["max_episodes"]``.
            Defaults to 3, the value this tutorial previously hard-coded.
        disable_verification: Forwarded unchanged; defaults to ``True`` as before.

    Returns:
        Whatever ``run_dataset_to_traces`` returns (the tutorial calls
        ``len()`` on it and uploads it as the traces dataset).
    """
    return run_dataset_to_traces(
        str(task_dir),
        model_name=model_name,
        agent_name=agent_name,
        n_concurrent=n_concurrent,
        agent_kwargs={"max_episodes": max_episodes},
        disable_verification=disable_verification,
    )

# Run the default model/agent on the subsampled tasks; the result is uploaded below.
traces_dataset = run_traces(subsampled_task_dir, n_concurrent=8)
print(f"Collected {len(traces_dataset)} traces")

In [None]:
# Destination Hugging Face repos. Both names are checked before anything is
# uploaded, so a forgotten name cannot leave a tasks-only, half-finished upload.
tasks_repo = "" # TODO: Fill in the HF tasks repo name
traces_repo = "" # TODO: Fill in the HF traces repo name
if not tasks_repo or not traces_repo:
    raise ValueError("Set both tasks_repo and traces_repo before uploading")

# Upload tasks
upload_tasks_to_hf(str(subsampled_task_dir), tasks_repo)
print(f"✓ Tasks uploaded to {tasks_repo}")

# Upload traces
upload_traces_to_hf(traces_dataset, traces_repo, "SFT")
print(f"✓ Traces uploaded to {traces_repo}")

# SFT

In [1]:
import os
import sys
import yaml
from pathlib import Path
from huggingface_hub import snapshot_download

In [None]:
DATASET = ""  # TODO: Fill in the dataset name specified in the data generation step
MODEL = "Qwen/Qwen2.5-0.5B-Instruct"  # Small model for demo
OUTPUT_DIR = "outputs/inferredbugs_demo"

# Training arguments; a later cell dumps this dict to YAML for llamafactory.
# No DeepSpeed config is specified for this demo.
config = dict(
    # --- Model ---
    model_name_or_path=MODEL,
    trust_remote_code=True,
    attn="fa2",
    enable_liger_kernel=True,
    optim="adamw_torch_fused",

    # --- Method ---
    stage="sft",
    do_train=True,
    finetuning_type="full",
    include_mfu=True,

    # --- Dataset ---
    dataset=DATASET,
    dataset_dir="ONLINE",  # Use online HuggingFace dataset
    template="qwen",
    # Small cutoff for demo purposes; use a larger one for more effective training.
    cutoff_len=2048,
    overwrite_cache=True,
    preprocessing_num_workers=16,
    formatting="sharegpt",
    messages="conversations",
    role_tag="role",
    content_tag="content",
    user_tag="user",
    assistant_tag="assistant",

    # --- Training params ---
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    learning_rate=4.0e-5,
    num_train_epochs=3.0,
    # NOTE(review): 1e-4 is an unusually tight clipping threshold — confirm it is intended.
    max_grad_norm=1.0e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    bf16=True,
    gradient_checkpointing=True,

    # --- Output ---
    output_dir=OUTPUT_DIR,
    logging_steps=1,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=1,
    plot_loss=True,
)

print(f"Dataset: {DATASET}")
print(f"Model: {MODEL}")
print(f"Output: {OUTPUT_DIR}")

In [None]:
# Fetch the base model from the Hugging Face Hub.
model_path = snapshot_download(repo_id=MODEL, repo_type="model")
print(f"Model downloaded to: {model_path}")

# Fetch the traces dataset; DATASET must be filled in above for this to work.
dataset_path = snapshot_download(repo_id=DATASET, repo_type="dataset")
print(f"Dataset downloaded to: {dataset_path}")

# Update config with downloaded paths
config["model_name_or_path"] = model_path
config["dataset"] = dataset_path

In [None]:
# Write the training config next to the training outputs; the launch cell
# passes this file's absolute path to llamafactory.
os.makedirs(OUTPUT_DIR, exist_ok=True)
config_path = f"{OUTPUT_DIR}/train_config.yaml"

# Block style (default_flow_style=False) gives one "key: value" pair per line.
with open(config_path, 'w') as f:
    yaml.dump(config, f, default_flow_style=False)

print(f"Config saved to: {config_path}")

In [6]:
# Expose only GPU 0 to this process; set here, before llamafactory is imported below.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
# Add llamafactory to path (go up one directory from notebook/).
# The relative path is resolved against the current working directory.
llamafactory_path = os.path.abspath("../sft/llamafactory/src")
if llamafactory_path not in sys.path:
    sys.path.insert(0, llamafactory_path)

print(f"Added to path: {llamafactory_path}")
print("\nStarting training...")
print("=" * 70)

# Set sys.argv for llamafactory (use absolute path). run_exp() is called with
# no arguments, so the config file is handed over through sys.argv.
# NOTE(review): presumably run_exp parses sys.argv[1] as the YAML config — confirm.
sys.argv = ["train.py", os.path.abspath(config_path)]

# Imported only now so that the sys.path entry added above is in effect.
from llamafactory.train.tuner import run_exp
run_exp()

print("=" * 70)
print("Training completed!")