In [1]:
from qwen_finetuning import QwenFineTuningConfig, QwenFineTuning

In [2]:
# Configuration matching original dynamics but memory-safe
config = QwenFineTuningConfig(
    model_name="Qwen/Qwen3-8B",
    train_file="data/train.jsonl",
    output_dir="./model/v2",
    batch_size=4,                       # Reduced for memory
    gradient_accumulation_steps=3,      # Effective batch = 12 (same as original)
    learning_rate=4e-5,                 # Same as original
    warmup_ratio=0.1,                   # Same as original
    lr_scheduler_type="cosine",         # Same as original
    num_epochs=2,                       # Same as original
    max_length=512,                     # Same as original
    lora_r=24,                          # Same as original
    lora_alpha=48,                      # Same as original
    lora_dropout=0.1,                   # Same as original
    gradient_checkpointing=True,        # Enable for memory savings
)

In [3]:
# Create fine-tuning instance
finetuner = QwenFineTuning(config)

In [4]:
# Load data
train_data = finetuner.load_jsonl(config.train_file)
print(f"Training samples: {len(train_data)}")

Training samples: 86929


In [5]:
# Run training
finetuner.setup_model()

config.json:   0%|          | 0.00/728 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.24G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/3.19G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

trainable params: 65,470,464 || all params: 8,256,205,824 || trainable%: 0.7930


In [6]:
finetuner.setup_trainer(train_data)

Formatting: 100%|██████████| 86929/86929 [00:13<00:00, 6419.08it/s]


Adding EOS to train dataset:   0%|          | 0/86929 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/86929 [00:00<?, ? examples/s]

Packing train dataset:   0%|          | 0/86929 [00:00<?, ? examples/s]

✓ Trainer ready (86929 samples, 14488 steps)


In [7]:
finetuner.train()


Starting training...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Step,Training Loss
50,2.0076
100,1.4354
150,1.2756
200,1.2285
250,1.2178
300,1.2057
350,1.1933
400,1.1766
450,1.1746
500,1.1664


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

✓ Training completed


In [8]:
finetuner.save_model()

✓ Model saved to ./model/v2


In [9]:
print(f"\n✅ Training complete! Model saved to: {config.output_dir}")


✅ Training complete! Model saved to: ./model/v2
