# Fine-tuning Qwen 3 on a Custom Dataset

## Imports and setup

In [1]:
%%capture
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
!pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
!pip install --no-deps unsloth
!pip install regex transformers rich

In [2]:
import unsloth
import torch
from unsloth import FastModel
from datasets import load_dataset
from trl import SFTTrainer, SFTConfig
from transformers import TextStreamer, GenerationConfig
import re

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


    PyTorch 2.6.0+cu124 with CUDA 1204 (you have 2.8.0.dev20250319+cu128)
    Python  3.11.11 (you have 3.11.11)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details


🦥 Unsloth Zoo will now patch everything to make training faster!


## Load model and tokenizer

In [3]:
print("Loading Qwen3 model and tokenizer...")
model, tokenizer = FastModel.from_pretrained(
    model_name="unsloth/Qwen3-14B",
    max_seq_length=2048,  # Choose any for long context
    load_in_4bit=True,  # 4 bit quantization to reduce memory
    full_finetuning=False,
)

Loading Qwen3 model and tokenizer...
==((====))==  Unsloth 2025.5.2: Fast Qwen3 patching. Transformers: 4.51.3.
   \\   /|    NVIDIA L40S. Num GPUs = 1. Max memory: 44.521 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0.dev20250319+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

## Load and tokenize dataset

In [4]:
print("Loading Bullet Echo Wiki QA dataset...")
dataset_name = "bexgboost/bullet-echo-wiki-qa"
full_dataset = load_dataset(dataset_name, trust_remote_code=True)

# Split dataset into training and validation sets (90% train, 10% validation)
train_val_split = full_dataset["train"].train_test_split(
    test_size=0.1, seed=42, shuffle=True
)
train_dataset = train_val_split["train"]
val_dataset = train_val_split["test"]  # This becomes our validation set

print(
    f"Training examples: {len(train_dataset)}, Validation examples: {len(val_dataset)}"
)


Loading Bullet Echo Wiki QA dataset...
Training examples: 2711, Validation examples: 302


In [5]:
print("Formatting datasets with Qwen3 chat template...")
EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN


def format_data(example):
    # Qwen3 uses a chat template, so we'll format it accordingly
    messages = [
        {"role": "user", "content": example["question"]},
        {"role": "assistant", "content": example["answer"] + EOS_TOKEN},
    ]
    # The tokenizer.apply_chat_template handles special tokens for Qwen3
    return {"text": tokenizer.apply_chat_template(messages, tokenize=False)}


# Format both training and validation datasets
formatted_train_dataset = train_dataset.map(format_data)
formatted_val_dataset = val_dataset.map(format_data)

Formatting datasets with Qwen3 chat template...


In [6]:
print("Tokenizing datasets...")


def tokenize_function(examples):
    # padding=False because SFTTrainer will handle padding
    return tokenizer(
        examples["text"],
        padding=False,
        truncation=True,
        max_length=model.config.max_position_embeddings,
    )


# Process both datasets
processed_train_dataset = formatted_train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["id", "question", "answer", "text"],
    desc="Tokenizing training dataset",
)

processed_val_dataset = formatted_val_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["id", "question", "answer", "text"],
    desc="Tokenizing validation dataset",
)

Tokenizing datasets...


Tokenizing training dataset:   0%|          | 0/2711 [00:00<?, ? examples/s]

Tokenizing validation dataset:   0%|          | 0/302 [00:00<?, ? examples/s]

## Setup PEFT Model

In [7]:
print("Setting up PEFT model with LoRA...")
model = FastModel.get_peft_model(
    model,
    r=8,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    finetune_vision_layers=False,  # Turn off for just text!
    finetune_language_layers=True,
    finetune_attention_modules=True,
    finetune_mlp_modules=True,
    lora_alpha=8,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=1000,
    use_rslora=False,
)


Setting up PEFT model with LoRA...
Unsloth: Making `model.base_model.model.model` require gradients


## Train the model

In [8]:
print("Configuring SFTTrainer with evaluation...")
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=processed_train_dataset,
    eval_dataset=processed_val_dataset,  # Add validation dataset
    args=SFTConfig(
        dataset_text_field="text",
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,  # Batch size for evaluation
        gradient_accumulation_steps=4,
        warmup_steps=5,
        num_train_epochs=3,
        # max_steps=100,  # For quick testing
        learning_rate=2e-4,
        logging_steps=200,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        eval_strategy="steps",
        eval_steps=200,  # Evaluate every 200 steps
        save_strategy="steps",  # Save checkpoints based on evaluation
        save_steps=200,  # Save every 200 steps
        load_best_model_at_end=True,  # Load best model at the end of training
        metric_for_best_model="eval_loss",  # Use evaluation loss to determine best model
        greater_is_better=False,  # Lower loss is better
        save_total_limit=3,  # Keep only the 3 best checkpoints
    ),
)

Configuring SFTTrainer with evaluation...


In [9]:
print("Starting fine-tuning process with validation...")
training_results = trainer.train()

# Print evaluation metrics
print("Training completed!")

Starting fine-tuning process with validation...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,711 | Num Epochs = 3 | Total steps = 1,017
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 32,112,640/14,000,000,000 (0.23% trained)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
200,1.5719,1.287612
400,1.2114,1.214332
600,1.081,1.182069
800,0.9604,1.207953
1000,0.8795,1.197931


Unsloth: Not an error, but Qwen3ForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


Training completed!


In [10]:
print(f"Final training metrics: {training_results.metrics}")

Final training metrics: {'train_runtime': 1461.829, 'train_samples_per_second': 5.564, 'train_steps_per_second': 0.696, 'total_flos': 5.716715497264128e+16, 'train_loss': 1.136481964013804}


## Model inference

In [11]:
print("Setting up model for inference...")
unsloth.FastModel.for_inference(model)  # Enable native 2x faster inference


def generate_response(
    model, tokenizer, query, temperature=0.7, top_p=0.9, max_new_tokens=256
):
    """
    Generate a response from the fine-tuned model.

    Args:
        model: The fine-tuned model
        tokenizer: The tokenizer
        query: The user query/question
        temperature: Controls randomness in generation (lower = more deterministic)
        top_p: Nucleus sampling parameter (lower = more focused)
        max_new_tokens: Maximum new tokens to generate

    Returns:
        Generated response text
    """
    # Format the query as a chat message
    messages = [{"role": "user", "content": query}]

    # Prepare model inputs
    inputs = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to("cuda")

    # Create attention mask (all 1s) with the same shape as inputs
    attention_mask = torch.ones_like(inputs).to("cuda")

    # Configure generation parameters
    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        remove_invalid_values=True,
        # Disable thinking tags
        suppression_tokens=(
            [
                tokenizer.encode("<think>", add_special_tokens=False)[0],
                tokenizer.encode("</think>", add_special_tokens=False)[0],
            ]
            if len(tokenizer.encode("<think>", add_special_tokens=False)) > 0
            else None
        ),
    )

    # Custom text filtering function
    def filter_thinking(text):
        # Remove anything between <think> and </think> tags
        text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
        # Remove any remaining <think> or </think> tags
        text = re.sub(r"<think>|</think>", "", text)
        return text

    # Custom streamer class to filter thinking tags
    class FilteredTextStreamer(TextStreamer):
        def on_finalized_text(self, text: str, stream_end: bool = False):
            filtered_text = filter_thinking(text)
            if filtered_text.strip():  # Only print non-empty text
                print(filtered_text, end="", flush=True)

    # Initialize filtered text streamer
    streamer = FilteredTextStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )

    # Display query
    print(f"User: {query}")
    print("Assistant:")

    # Generate response
    output = model.generate(
        inputs,
        attention_mask=attention_mask,
        generation_config=generation_config,
        streamer=streamer,
        return_dict_in_generate=True,
        output_scores=False,
    )

    # For non-streaming use (optional):
    # output_text = tokenizer.decode(output.sequences[0], skip_special_tokens=True)
    # return filter_thinking(output_text)

    print("\n")  # Add a newline after generation
    return None  # Since we're streaming, we don't return the output

Setting up model for inference...


In [16]:
# Test the model with sample queries
print("\n--- Testing Model Responses ---")

test_queries = [
    "What's the best strategy for Cyclops in Bullet Echo?",
    "How does the Stalker's invisibility work in the game?",
    "Which heroes are effective against Bastion in Bullet Echo?",
]

for query in test_queries:
    generate_response(model, tokenizer, query)


--- Testing Model Responses ---
User: What's the best strategy for Cyclops in Bullet Echo?
Assistant:
The best strategy for Cyclops is to stay hidden, as this is his greatest strength. He excels in ambush tactics, allowing him to surprise enemies and maximize his effectiveness in stealthy encounters.

User: How does the Stalker's invisibility work in the game?
Assistant:
The Stalker uses a special ability called invisibility, which makes the character temporarily undetectable by opponents. This is often used for stealth movements and surprise attacks.

User: Which heroes are effective against Bastion in Bullet Echo?
Assistant:
Heroes with high damage and quick movement, such as Levi, Blot, and Lynx, are effective against Bastion due to their ability to deal quick hits and outmaneuver his slow but powerful attacks.



## Save model

In [14]:
print("\nSaving fine-tuned model...")
output_model_name = "qwen3-bullet-echo-qa-lora"
model.save_pretrained(output_model_name)
tokenizer.save_pretrained(output_model_name)
print(f"Model successfully saved to: ./{output_model_name}")

# Optional: Push to Hugging Face Hub
# from huggingface_hub import login
# login()
# hub_model_id = f"your-hf-username/{output_model_name}"
# model.push_to_hub(hub_model_id)
# tokenizer.push_to_hub(hub_model_id)
# print(f"Model pushed to Hugging Face Hub: {hub_model_id}")

print("\n🦥 Fine-tuning script completed successfully! 🦥")


Saving fine-tuned model...
Model successfully saved to: ./qwen3-bullet-echo-qa-lora

🦥 Fine-tuning script completed successfully! 🦥


## Load saved model

In [17]:
print("\n--- Loading Saved Fine-tuned Model ---")

# Load the saved model and tokenizer
saved_model_path = output_model_name  # "qwen3-bullet-echo-qa-lora"
loaded_model, loaded_tokenizer = FastModel.from_pretrained(
    model_name=output_model_name,
    max_seq_length=2048,
    load_in_4bit=True,
    full_finetuning=False,
)

# Enable faster inference
unsloth.FastModel.for_inference(loaded_model)

print("Model successfully loaded for inference!")

# Test with new queries
print("\n--- Testing Loaded Model Responses ---")

new_test_queries = [
    "What's the best strategy for Cyclops in Bullet Echo?",
    "How does the Stalker's invisibility work in the game?",
    "Which heroes are effective against Bastion in Bullet Echo?",
]

for query in new_test_queries:
    generate_response(loaded_model, loaded_tokenizer, query, temperature=0.2)

print("\n🦥 Model loading and inference testing completed! 🦥")



--- Loading Saved Fine-tuned Model ---
==((====))==  Unsloth 2025.5.2: Fast Qwen3 patching. Transformers: 4.51.3.
   \\   /|    NVIDIA L40S. Num GPUs = 1. Max memory: 44.521 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0.dev20250319+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Model successfully loaded for inference!

--- Testing Loaded Model Responses ---
User: What's the best strategy for Cyclops in Bullet Echo?
Assistant:
The best strategy for Cyclops is to stay hidden and avoid direct confrontation, using his stealth and invisibility to ambush enemies or escape dangerous situations.

User: How does the Stalker's invisibility work in the game?
Assistant:
The Stalker can become invisible for a limited time, allowing it to move undetected across the map. This invisibility can be used to avoid enemies, set up ambushes, or escape dangerous situations.

User: Which heroes are effective against Bastion in Bullet Echo?
Assistant:
Heroes with high damage and mobility, such as Lynx, Slayer, and Stalker, are effective against Bastion due to his low health and slow movement speed.


🦥 Model loading and inference testing completed! 🦥
