In [1]:
# %%
from qwen_finetuning import QwenFineTuningConfig, QwenFineTuning, ThinkingMode


In [2]:
config = QwenFineTuningConfig(
    model_name="Qwen/Qwen3-8B",
    train_file="data/mixed_distributed.jsonl",  # Use pre-distributed dataset
    output_dir="./model/mixed_v1",
    thinking_mode=ThinkingMode.MIXED,       # Enable mixed thinking/non-thinking training
    batch_size=8,                           # Higher batch size for RTX 6000 48GB VRAM
    gradient_accumulation_steps=4,          # Effective batch = 32
    learning_rate=4e-5,                     # Will be auto-adjusted to 3.2e-5 for mixed mode
    warmup_ratio=0.1,                       # Standard warmup for mixed training
    lr_scheduler_type="cosine",
    num_epochs=1,                           # 1 epoch sufficient for 92K samples
    lora_r=24,
    lora_alpha=48,
    lora_dropout=0.1,
    gradient_checkpointing=True,            # Keep for memory safety
)

In [3]:

# %%
config.print_config()


Model: Qwen/Qwen3-8B
Learning rate: 3.2000000000000005e-05, Epochs: 1
Batch size: 4 (effective: 16)
LoRA: r=24, alpha=48, dropout=0.1
Thinking mode: mixed
Max length: 2048


In [4]:

# %%
# Create fine-tuning instance
finetuner = QwenFineTuning(config)


In [5]:
train_data = finetuner.load_jsonl(config.train_file)
print(f"Training samples: {len(train_data)}")

Training samples: 92291


In [6]:
finetuner.setup_model()

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

trainable params: 65,470,464 || all params: 8,256,205,824 || trainable%: 0.7930


In [7]:
finetuner.setup_trainer(train_data)

Formatting: 100%|██████████| 92291/92291 [00:10<00:00, 8622.52it/s]


Dataset prepared: 92291 examples
Mixed training: 21108 thinking, 71183 non-thinking
Original data had 21108 examples with thinking content




Adding EOS to train dataset:   0%|          | 0/92291 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/92291 [00:00<?, ? examples/s]

Packing train dataset:   0%|          | 0/92291 [00:00<?, ? examples/s]

✓ Trainer ready (92291 samples, 5768 steps)
✓ Thinking mode: mixed
✓ Effective batch size: 16


In [8]:

# %%
finetuner.train()



Starting training with mixed mode...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Step,Training Loss
50,1.2349
100,0.9591
150,0.9177
200,0.9015
250,0.8624
300,0.8771
350,0.8753
400,0.8718


✓ Training completed


In [9]:

# %%
finetuner.save_model()


✓ Model saved to ./model/mixed_v1


In [10]:

# %%
print(f"\n✅ Mixed training complete!")
print(f"Model saved to: {config.output_dir}")
print(f"Training mode: {config.thinking_mode.value}")
print(f"Effective batch size: {config.effective_batch_size}")
print(f"Max sequence length: {config.max_length}")


✅ Mixed training complete!
Model saved to: ./model/mixed_v1
Training mode: mixed
Effective batch size: 16
Max sequence length: 2048
