In [1]:
# %%
from qwen_finetuning import QwenFineTuningConfig, QwenFineTuning


In [None]:
# Hyper-parameters for this run, grouped by concern. Every keyword/value pair is
# passed straight to QwenFineTuningConfig; how each one is interpreted is defined
# in the qwen_finetuning module, not here.
config = QwenFineTuningConfig(
    # --- model and data locations ---
    model_name="Qwen/Qwen3-8B",
    train_file="data/train_thinking.jsonl",
    output_dir="./model/thinking_v1",
    enable_thinking=True,
    # --- LoRA adapter settings (alpha is 2 x r) ---
    lora_r=24,
    lora_alpha=48,
    lora_dropout=0.1,
    # --- optimisation schedule ---
    num_epochs=1,
    learning_rate=8e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    # --- batching / memory ---
    # batch_size 1 with 16 accumulation steps; print_config() reports this as
    # an effective batch size of 16.
    batch_size=1,
    gradient_accumulation_steps=16,
    max_length=1536,
    gradient_checkpointing=True,
)

In [3]:

# Echo the configured hyper-parameters so the saved notebook output records
# exactly which settings this run used.
config.print_config()


Model: Qwen/Qwen3-8B
Learning rate: 8e-05, Epochs: 1
Batch size: 1 (effective: 16)
LoRA: r=24, alpha=48, dropout=0.1
Thinking mode: Enabled
Max length: 1536


In [4]:

# %%
# Create the fine-tuning driver from the run configuration. The cells that follow
# call its load_jsonl / setup_model / setup_trainer / train / save_model
# methods in that order.
finetuner = QwenFineTuning(config)


In [5]:

# %%
# Load the training samples from the JSONL file named in the config.
train_data = finetuner.load_jsonl(config.train_file)

# Fail fast on an empty dataset: the next cells load an 8B-parameter model and
# build a trainer, which is a costly way to discover that the file had no rows.
if len(train_data) == 0:
    raise ValueError(f"No training samples found in {config.train_file}")

print(f"Training samples: {len(train_data)}")


Training samples: 9155


In [6]:

# %%
# Prepare the model (this cell does not start training). The captured output
# shows checkpoint shards being loaded followed by a trainable-parameter
# summary (~0.79% trainable).
# NOTE(review): presumably this is where the LoRA adapters are attached --
# confirm against QwenFineTuning.setup_model.
finetuner.setup_model()


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

trainable params: 65,470,464 || all params: 8,256,205,824 || trainable%: 0.7930


In [7]:

# %%
# Build the trainer from the loaded samples. Per the captured output, this step
# formats, tokenizes and packs the dataset and then reports the sample count
# and the number of training steps.
finetuner.setup_trainer(train_data)


Formatting: 100%|██████████| 9155/9155 [00:01<00:00, 5478.92it/s]


Dataset prepared: 9155 examples
Examples with thinking content: 9155/9155




Adding EOS to train dataset:   0%|          | 0/9155 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/9155 [00:00<?, ? examples/s]

Packing train dataset:   0%|          | 0/9155 [00:00<?, ? examples/s]

✓ Trainer ready (9155 samples, 572 steps)
✓ Thinking mode: Enabled


In [8]:
import gc

import torch

# Snapshot GPU memory before training starts.
# Order matters: run the Python garbage collector first so tensors that are only
# kept alive by unreachable objects are released, THEN ask the CUDA caching
# allocator to return its unused cached blocks. Doing it the other way round
# leaves memory freed by the collector sitting in the cache, which inflates the
# "reserved" figure printed below.
gc.collect()
torch.cuda.empty_cache()

# Figures are reported in decimal gigabytes (bytes / 1e9).
print(f"GPU memory allocated: {torch.cuda.memory_allocated()/1e9:.2f} GB")
print(f"GPU memory reserved: {torch.cuda.memory_reserved()/1e9:.2f} GB")

GPU memory allocated: 16.65 GB
GPU memory reserved: 16.67 GB


In [9]:

# %%
import os

# The tokenizer has already run in parallel by the time training forks new
# processes, so huggingface/tokenizers prints a "current process just got
# forked" warning for every fork and floods the cell output. Setting the
# variable explicitly -- the remedy the warning itself recommends -- silences
# it. setdefault() keeps any value the user has already exported.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

# Run the fine-tuning loop; the training loss is logged in the cell output.
finetuner.train()



Starting training...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Step,Training Loss
50,0.7625
100,0.5533
150,0.535


✓ Training completed


In [10]:

# %%
# Persist the trained model. The captured output reports the destination as
# ./model/thinking_v1, i.e. the output_dir set in the config.
# NOTE(review): whether this writes only adapter weights or a merged model is
# not visible from this notebook -- confirm in QwenFineTuning.save_model.
finetuner.save_model()


✓ Model saved to ./model/thinking_v1


In [11]:

# %%
# Final confirmation message pointing the reader at the output directory.
print(f"\n✅ Training complete! Model saved to: {config.output_dir}")


✅ Training complete! Model saved to: ./model/thinking_v1
