In [1]:
from qwen_finetuning import QwenFineTuningConfig, QwenFineTuning

In [2]:
# Build the fine-tuning configuration. Only the values below are set
# explicitly; everything listed in the trailing comment block is left to the
# library defaults.
config = QwenFineTuningConfig(
    model_name="Qwen/Qwen3-8B",
    train_file="data/train.jsonl",
    output_dir="./results_optimized",
    
    batch_size=4,                   # Still optimal for 24GB VRAM
    gradient_accumulation_steps=2, # Effective batch size = 4 * 2 = 8
    learning_rate=2e-4,             # Still optimal
    num_epochs=1,                   # Reduced from 2 with optimizations
    max_length=512,
    lora_r=16,
    lora_alpha=32,
    
    # AUTOMATIC OPTIMIZATIONS (these are now defaults):
    # - use_rslora=True              # 🚀 RSLoRA for 5-15% better performance
    # - target_modules="all-linear"  # 🚀 All linear layers for maximum performance  
    # - lora_dropout=0.1             # 🚀 Better regularization (was 0.05)
    # - lr_scheduler_type="cosine_with_restarts"  # 🚀 Better than linear decay
    # - warmup_ratio=0.03            # 🚀 Optimal warmup for your dataset size
    # - Flash Attention 2 auto-enabled with fallback
)

In [3]:
# Print the resolved configuration, including the defaults that were not set
# explicitly when the config object was built.
config.print_config()


✓ Configuration set with optimizations
Model: Qwen/Qwen3-8B
Learning rate: 0.0002
LR scheduler: cosine_with_restarts (warmup: 0.03)
Batch size: 4
Effective batch size: 8
LoRA optimizations:
  - RSLoRA enabled: True
  - Target modules: ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj']
  - Rank: 16, Alpha: 32, Dropout: 0.1
Dataset processing cores: 4
Cache writer batch size: 500
DataLoader workers: 4
DataLoader optimizations: pin_memory=True, persistent_workers=True
GPU cache management: empty every 4 steps


In [4]:
# Create the fine-tuning driver from the config. Judging by the message it
# prints, construction also loads the environment and checks that a
# Hugging Face token is available.
finetuner = QwenFineTuning(config)


✓ Environment loaded, HF token available


In [5]:
# Load the training examples from the JSONL file named in the config
# (config.train_file).
train_data = finetuner.load_jsonl(config.train_file)


In [6]:
# Run the end-to-end pipeline on the loaded examples. Per the log it emits,
# this loads the model and tokeniser, applies the LoRA configuration, sets up
# the trainer (reusing a cached processed dataset when one exists), trains,
# and saves the model.
finetuner.run_complete_finetuning(train_data=train_data)


Train Dataset: 86929 examples
Categories: unknown(86929)
Answer distribution: A(24205), B(24441), C(24625), D(11146), E(2512)
Loading model and tokeniser with optimizations...


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

✓ Optimized LoRA configuration applied:
trainable params: 43,646,976 || all params: 8,234,382,336 || trainable%: 0.5301
  - RSLoRA enabled: True
  - Target modules: ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj']

Example prompt format:
<|im_start|>user
Domanda: Il diario clinico ha lo scopo di:...

A) Permettere la ricostruzione del decorso clinico del residente documentando le scelt...

Optimizations enabled:
  - RSLoRA: True
  - Target modules: ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj']
  - LR scheduler: cosine_with_restarts
  - Warmup ratio: 0.03
  - Dataset processing: 4 CPU cores
  - Memory-efficient caching: batch size 500
  - Optimized DataLoader: 4 workers, pin_memory, persistent_workers
  - GPU memory management: cache clearing every 4 steps
Setting up trainer with optimized configuration...
✓ Loading cached dataset from: cache/processed_datasets/342ab7c6db43e6f8df6ed1e851ed55d9
✓ Dataset loaded efficiently

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


✓ Trainer configured with optimizations
Starting optimized training...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
20,1.5416
40,1.1571
60,1.1456
80,1.0986
100,1.1262
120,1.1181
140,1.0643
160,1.0408
180,1.0871
200,1.0744


Saving model...
✓ Optimized training completed


In [7]:
# Report completion and the configured output directory.
# The first message has no placeholders, so it is a plain string literal
# rather than an f-string; the printed text is unchanged.
print("\n1-epoch fine-tuning completed successfully")
print(f"Model saved to: {config.output_dir}")


1-epoch fine-tuning completed successfully
Model saved to: ./results_optimized
