diff --git a/README.md b/README.md index 5a5b1371db..d21b725630 100644 --- a/README.md +++ b/README.md @@ -128,7 +128,7 @@ Running Environment: | python | >=3.9 | 3.10/3.11 | | | cuda | | cuda12 | No need to install if using CPU, NPU, MPS | | torch | >=2.0 | 2.7.1 | | -| transformers | >=4.33 | 4.56.1 | | +| transformers | >=4.33 | 4.56.2 | | | modelscope | >=1.23 | | | | peft | >=0.11,<0.18 | | | | flash_attn | | 2.7.4.post1/3.0.0b1 | | diff --git a/README_CN.md b/README_CN.md index aa1b28365d..bc7154cccb 100644 --- a/README_CN.md +++ b/README_CN.md @@ -124,7 +124,7 @@ pip install -e . | python | >=3.9 | 3.10/3.11 | | | cuda | | cuda12 | 使用cpu、npu、mps则无需安装 | | torch | >=2.0 | 2.7.1 | | -| transformers | >=4.33 | 4.56.1 | | +| transformers | >=4.33 | 4.56.2 | | | modelscope | >=1.23 | | | | peft | >=0.11,<0.18 | | | | flash_attn | | 2.7.4.post1/3.0.0b1 | | diff --git "a/docs/source/GetStarted/SWIFT\345\256\211\350\243\205.md" "b/docs/source/GetStarted/SWIFT\345\256\211\350\243\205.md" index b1bd769449..97be6dd506 100644 --- "a/docs/source/GetStarted/SWIFT\345\256\211\350\243\205.md" +++ "b/docs/source/GetStarted/SWIFT\345\256\211\350\243\205.md" @@ -99,7 +99,7 @@ modelscope-registry.us-west-1.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu2 | python | >=3.9 | 3.10/3.11 | | | cuda | | cuda12 | 使用cpu、npu、mps则无需安装 | | torch | >=2.0 | 2.7.1 | | -| transformers | >=4.33 | 4.56.1 | | +| transformers | >=4.33 | 4.56.2 | | | modelscope | >=1.23 | | | | peft | >=0.11,<0.18 | | | | flash_attn | | 2.7.4.post1/3.0.0b1 | | diff --git "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" index aaa435663c..4a5ff7e9f7 100644 --- "a/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" +++ "b/docs/source/Instruction/\346\224\257\346\214\201\347\232\204\346\250\241\345\236\213\345\222\214\346\225\260\346\215\256\351\233\206.md" @@ -706,8 +706,14 @@ |[Qwen/Qwen3-Omni-30B-A3B-Captioner](https://modelscope.cn/models/Qwen/Qwen3-Omni-30B-A3B-Captioner)|qwen3_omni|qwen3_omni|transformers>=4.57.dev0, soundfile, decord, qwen_omni_utils|✔|vision, video, audio|[Qwen/Qwen3-Omni-30B-A3B-Captioner](https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Captioner)| |[Qwen/Qwen2-Audio-7B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-Audio-7B-Instruct)|qwen2_audio|qwen2_audio|transformers>=4.45,<4.49, librosa|✘|audio|[Qwen/Qwen2-Audio-7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct)| |[Qwen/Qwen2-Audio-7B](https://modelscope.cn/models/Qwen/Qwen2-Audio-7B)|qwen2_audio|qwen2_audio|transformers>=4.45,<4.49, librosa|✘|audio|[Qwen/Qwen2-Audio-7B](https://huggingface.co/Qwen/Qwen2-Audio-7B)| +|[Qwen/Qwen3-VL-30B-A3B-Instruct](https://modelscope.cn/models/Qwen/Qwen3-VL-30B-A3B-Instruct)|qwen3_moe_vl|qwen3_vl|transformers>=4.57.0.dev, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct)| +|[Qwen/Qwen3-VL-30B-A3B-Thinking](https://modelscope.cn/models/Qwen/Qwen3-VL-30B-A3B-Thinking)|qwen3_moe_vl|qwen3_vl|transformers>=4.57.0.dev, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3-VL-30B-A3B-Thinking](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Thinking)| 
+|[Qwen/Qwen3-VL-30B-A3B-Instruct-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-30B-A3B-Instruct-FP8)|qwen3_moe_vl|qwen3_vl|transformers>=4.57.0.dev, qwen_vl_utils>=0.0.14, decord|✘|vision, video|[Qwen/Qwen3-VL-30B-A3B-Instruct-FP8](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct-FP8)| +|[Qwen/Qwen3-VL-30B-A3B-Thinking-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-30B-A3B-Thinking-FP8)|qwen3_moe_vl|qwen3_vl|transformers>=4.57.0.dev, qwen_vl_utils>=0.0.14, decord|✘|vision, video|[Qwen/Qwen3-VL-30B-A3B-Thinking-FP8](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Thinking-FP8)| |[Qwen/Qwen3-VL-235B-A22B-Instruct](https://modelscope.cn/models/Qwen/Qwen3-VL-235B-A22B-Instruct)|qwen3_moe_vl|qwen3_vl|transformers>=4.57.0.dev, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3-VL-235B-A22B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-235B-A22B-Instruct)| |[Qwen/Qwen3-VL-235B-A22B-Thinking](https://modelscope.cn/models/Qwen/Qwen3-VL-235B-A22B-Thinking)|qwen3_moe_vl|qwen3_vl|transformers>=4.57.0.dev, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3-VL-235B-A22B-Thinking](https://huggingface.co/Qwen/Qwen3-VL-235B-A22B-Thinking)| +|[Qwen/Qwen3-VL-235B-A22B-Instruct-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-235B-A22B-Instruct-FP8)|qwen3_moe_vl|qwen3_vl|transformers>=4.57.0.dev, qwen_vl_utils>=0.0.14, decord|✘|vision, video|[Qwen/Qwen3-VL-235B-A22B-Instruct-FP8](https://huggingface.co/Qwen/Qwen3-VL-235B-A22B-Instruct-FP8)| +|[Qwen/Qwen3-VL-235B-A22B-Thinking-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-235B-A22B-Thinking-FP8)|qwen3_moe_vl|qwen3_vl|transformers>=4.57.0.dev, qwen_vl_utils>=0.0.14, decord|✘|vision, video|[Qwen/Qwen3-VL-235B-A22B-Thinking-FP8](https://huggingface.co/Qwen/Qwen3-VL-235B-A22B-Thinking-FP8)| |[Qwen/QVQ-72B-Preview](https://modelscope.cn/models/Qwen/QVQ-72B-Preview)|qvq|qvq|transformers>=4.45, qwen_vl_utils>=0.0.6, decord|✘|vision, video|[Qwen/QVQ-72B-Preview](https://huggingface.co/Qwen/QVQ-72B-Preview)| |[iic/gme-Qwen2-VL-2B-Instruct](https://modelscope.cn/models/iic/gme-Qwen2-VL-2B-Instruct)|qwen2_gme|qwen2_gme|-|✘|vision|[Alibaba-NLP/gme-Qwen2-VL-2B-Instruct](https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-2B-Instruct)| |[iic/gme-Qwen2-VL-7B-Instruct](https://modelscope.cn/models/iic/gme-Qwen2-VL-7B-Instruct)|qwen2_gme|qwen2_gme|-|✘|vision|[Alibaba-NLP/gme-Qwen2-VL-7B-Instruct](https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-7B-Instruct)| diff --git "a/docs/source/Megatron-SWIFT/\345\277\253\351\200\237\345\274\200\345\247\213.md" "b/docs/source/Megatron-SWIFT/\345\277\253\351\200\237\345\274\200\345\247\213.md" index 3eaf481e6f..f5d1a3cc13 100644 --- "a/docs/source/Megatron-SWIFT/\345\277\253\351\200\237\345\274\200\345\247\213.md" +++ "b/docs/source/Megatron-SWIFT/\345\277\253\351\200\237\345\274\200\345\247\213.md" @@ -67,7 +67,7 @@ modelscope-registry.us-west-1.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu2 | apex | | 0.1 | | | megatron_core | >=0.12 | 0.13 | | | flash_attn | | 2.7.4.post1/3.0.0b1 | | -| transformers | >=4.33 | 4.56.1 | | +| transformers | >=4.33 | 4.56.2 | | | modelscope | >=1.23 | | | | peft | >=0.11,<0.18 | | LoRA | | trl | >=0.15,<0.21 | | RLHF | diff --git a/docs/source_en/GetStarted/SWIFT-installation.md b/docs/source_en/GetStarted/SWIFT-installation.md index 50bbbddb18..92293cf845 100644 --- a/docs/source_en/GetStarted/SWIFT-installation.md +++ b/docs/source_en/GetStarted/SWIFT-installation.md @@ -100,7 +100,7 @@ More images can be found [here](https://modelscope.cn/docs/intro/environment-set | python | >=3.9 
| 3.10/3.11 | | | cuda | | cuda12 | No need to install if using CPU, NPU, MPS | | torch | >=2.0 | 2.7.1 | | -| transformers | >=4.33 | 4.56.1 | | +| transformers | >=4.33 | 4.56.2 | | | modelscope | >=1.23 | | | | peft | >=0.11,<0.18 | | | | flash_attn | | 2.7.4.post1/3.0.0b1 | | diff --git a/docs/source_en/Instruction/Supported-models-and-datasets.md b/docs/source_en/Instruction/Supported-models-and-datasets.md index 036fcb19e8..0c8e2101c2 100644 --- a/docs/source_en/Instruction/Supported-models-and-datasets.md +++ b/docs/source_en/Instruction/Supported-models-and-datasets.md @@ -706,8 +706,14 @@ The table below introduces the models integrated with ms-swift: |[Qwen/Qwen3-Omni-30B-A3B-Captioner](https://modelscope.cn/models/Qwen/Qwen3-Omni-30B-A3B-Captioner)|qwen3_omni|qwen3_omni|transformers>=4.57.dev0, soundfile, decord, qwen_omni_utils|✔|vision, video, audio|[Qwen/Qwen3-Omni-30B-A3B-Captioner](https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Captioner)| |[Qwen/Qwen2-Audio-7B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-Audio-7B-Instruct)|qwen2_audio|qwen2_audio|transformers>=4.45,<4.49, librosa|✘|audio|[Qwen/Qwen2-Audio-7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct)| |[Qwen/Qwen2-Audio-7B](https://modelscope.cn/models/Qwen/Qwen2-Audio-7B)|qwen2_audio|qwen2_audio|transformers>=4.45,<4.49, librosa|✘|audio|[Qwen/Qwen2-Audio-7B](https://huggingface.co/Qwen/Qwen2-Audio-7B)| +|[Qwen/Qwen3-VL-30B-A3B-Instruct](https://modelscope.cn/models/Qwen/Qwen3-VL-30B-A3B-Instruct)|qwen3_moe_vl|qwen3_vl|transformers>=4.57.0.dev, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct)| +|[Qwen/Qwen3-VL-30B-A3B-Thinking](https://modelscope.cn/models/Qwen/Qwen3-VL-30B-A3B-Thinking)|qwen3_moe_vl|qwen3_vl|transformers>=4.57.0.dev, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3-VL-30B-A3B-Thinking](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Thinking)| +|[Qwen/Qwen3-VL-30B-A3B-Instruct-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-30B-A3B-Instruct-FP8)|qwen3_moe_vl|qwen3_vl|transformers>=4.57.0.dev, qwen_vl_utils>=0.0.14, decord|✘|vision, video|[Qwen/Qwen3-VL-30B-A3B-Instruct-FP8](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct-FP8)| +|[Qwen/Qwen3-VL-30B-A3B-Thinking-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-30B-A3B-Thinking-FP8)|qwen3_moe_vl|qwen3_vl|transformers>=4.57.0.dev, qwen_vl_utils>=0.0.14, decord|✘|vision, video|[Qwen/Qwen3-VL-30B-A3B-Thinking-FP8](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Thinking-FP8)| |[Qwen/Qwen3-VL-235B-A22B-Instruct](https://modelscope.cn/models/Qwen/Qwen3-VL-235B-A22B-Instruct)|qwen3_moe_vl|qwen3_vl|transformers>=4.57.0.dev, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3-VL-235B-A22B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-235B-A22B-Instruct)| |[Qwen/Qwen3-VL-235B-A22B-Thinking](https://modelscope.cn/models/Qwen/Qwen3-VL-235B-A22B-Thinking)|qwen3_moe_vl|qwen3_vl|transformers>=4.57.0.dev, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3-VL-235B-A22B-Thinking](https://huggingface.co/Qwen/Qwen3-VL-235B-A22B-Thinking)| +|[Qwen/Qwen3-VL-235B-A22B-Instruct-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-235B-A22B-Instruct-FP8)|qwen3_moe_vl|qwen3_vl|transformers>=4.57.0.dev, qwen_vl_utils>=0.0.14, decord|✘|vision, video|[Qwen/Qwen3-VL-235B-A22B-Instruct-FP8](https://huggingface.co/Qwen/Qwen3-VL-235B-A22B-Instruct-FP8)| 
+|[Qwen/Qwen3-VL-235B-A22B-Thinking-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-235B-A22B-Thinking-FP8)|qwen3_moe_vl|qwen3_vl|transformers>=4.57.0.dev, qwen_vl_utils>=0.0.14, decord|✘|vision, video|[Qwen/Qwen3-VL-235B-A22B-Thinking-FP8](https://huggingface.co/Qwen/Qwen3-VL-235B-A22B-Thinking-FP8)| |[Qwen/QVQ-72B-Preview](https://modelscope.cn/models/Qwen/QVQ-72B-Preview)|qvq|qvq|transformers>=4.45, qwen_vl_utils>=0.0.6, decord|✘|vision, video|[Qwen/QVQ-72B-Preview](https://huggingface.co/Qwen/QVQ-72B-Preview)| |[iic/gme-Qwen2-VL-2B-Instruct](https://modelscope.cn/models/iic/gme-Qwen2-VL-2B-Instruct)|qwen2_gme|qwen2_gme|-|✘|vision|[Alibaba-NLP/gme-Qwen2-VL-2B-Instruct](https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-2B-Instruct)| |[iic/gme-Qwen2-VL-7B-Instruct](https://modelscope.cn/models/iic/gme-Qwen2-VL-7B-Instruct)|qwen2_gme|qwen2_gme|-|✘|vision|[Alibaba-NLP/gme-Qwen2-VL-7B-Instruct](https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-7B-Instruct)| diff --git a/docs/source_en/Megatron-SWIFT/Quick-start.md b/docs/source_en/Megatron-SWIFT/Quick-start.md index b3a36f1e85..8e9c1cf8d6 100644 --- a/docs/source_en/Megatron-SWIFT/Quick-start.md +++ b/docs/source_en/Megatron-SWIFT/Quick-start.md @@ -67,7 +67,7 @@ Recommended Operating Environment: | apex | | 0.1 | | | megatron_core | >=0.12 | 0.13 | | | flash_attn | | 2.7.4.post1/3.0.0b1 | | -| transformers | >=4.33 | 4.56.1 | | +| transformers | >=4.33 | 4.56.2 | | | modelscope | >=1.23 | | | | peft | >=0.11,<0.18 | | LoRA | | trl | >=0.15,<0.21 | | RLHF | diff --git a/examples/models/qwen3_vl/zero2.sh b/examples/models/qwen3_vl/zero2.sh new file mode 100644 index 0000000000..1b38e43ea3 --- /dev/null +++ b/examples/models/qwen3_vl/zero2.sh @@ -0,0 +1,36 @@ +# zero2: 70GiB +IMAGE_MAX_TOKEN_NUM=1024 \ +NPROC_PER_NODE=2 \ +CUDA_VISIBLE_DEVICES=0,1 \ +swift sft \ + --model Qwen/Qwen3-VL-30B-A3B-Instruct \ + --dataset 'AI-ModelScope/LaTeX_OCR:human_handwrite#20000' \ + --load_from_cache_file true \ + --split_dataset_ratio 0.01 \ + --train_type lora \ + --torch_dtype bfloat16 \ + --num_train_epochs 1 \ + --per_device_train_batch_size 16 \ + --per_device_eval_batch_size 16 \ + --attn_impl flash_attn \ + --padding_free true \ + --learning_rate 1e-4 \ + --lora_rank 8 \ + --lora_alpha 32 \ + --target_modules all-linear \ + --router_aux_loss_coef 1e-3 \ + --freeze_vit true \ + --freeze_aligner true \ + --gradient_accumulation_steps 1 \ + --gradient_checkpointing true \ + --eval_steps 100 \ + --save_steps 100 \ + --save_total_limit 2 \ + --logging_steps 5 \ + --max_length 2048 \ + --output_dir output \ + --warmup_ratio 0.05 \ + --deepspeed zero2 \ + --use_liger_kernel true \ + --dataset_num_proc 4 \ + --dataloader_num_workers 4 diff --git a/swift/__init__.py b/swift/__init__.py index 059e2ea13a..fc11fd23c8 100644 --- a/swift/__init__.py +++ b/swift/__init__.py @@ -12,9 +12,8 @@ PromptEncoderConfig, PromptLearningConfig, PromptTuningConfig, get_peft_config, get_peft_model, get_peft_model_state_dict, Prompt, PromptConfig, PromptModule, SwiftConfig, SwiftOutput, Swift, SwiftTuners, LongLoRAConfig, LongLoRA, LongLoRAModelType, SCETuning, SCETuningConfig) - from .trainers import (EvaluationStrategy, FSDPOption, HPSearchBackend, HubStrategy, IntervalStrategy, - SchedulerType, ShardedDDPOption, TrainingArguments, Seq2SeqTrainingArguments, Trainer, - Seq2SeqTrainer) + from .trainers import (FSDPOption, HPSearchBackend, HubStrategy, IntervalStrategy, SchedulerType, ShardedDDPOption, + TrainingArguments, Seq2SeqTrainingArguments, Trainer, Seq2SeqTrainer) from .utils 
import get_logger else: _import_structure = { @@ -29,7 +28,6 @@ 'Swift', 'SwiftTuners', 'LongLoRAConfig', 'LongLoRA', 'LongLoRAModelType', 'SCETuning', 'SCETuningConfig' ], 'trainers': [ - 'EvaluationStrategy', 'FSDPOption', 'HPSearchBackend', 'HubStrategy', diff --git a/swift/llm/model/model/qwen.py b/swift/llm/model/model/qwen.py index 48b485549e..568fbb4e53 100644 --- a/swift/llm/model/model/qwen.py +++ b/swift/llm/model/model/qwen.py @@ -826,6 +826,20 @@ def get_model_tokenizer_qwen2_5_vl(*args, **kwargs): tags=['vision', 'video'])) +def patch_Qwen3VLMoeTextExperts_dtype(): + from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import Qwen3VLMoeTextExperts + if hasattr(Qwen3VLMoeTextExperts, '_patch'): + return + Qwen3VLMoeTextExperts._patch = True + origin_forward = Qwen3VLMoeTextExperts.forward + + def forward(self, hidden_states, *args, **kwargs): + res = origin_forward(self, hidden_states, *args, **kwargs) + return res.to(hidden_states.dtype) + + Qwen3VLMoeTextExperts.forward = forward + + def get_model_tokenizer_qwen3_vl(model_dir, *args, **kwargs): from transformers import Qwen3VLForConditionalGeneration require_version('qwen_vl_utils>=0.0.14') @@ -850,6 +864,7 @@ def get_model_tokenizer_qwen3_moe_vl(model_dir, *args, **kwargs): require_version('qwen_vl_utils>=0.0.14') kwargs['automodel_class'] = kwargs['automodel_class'] or Qwen3VLMoeForConditionalGeneration kwargs['_check_qwen_vl_utils'] = False + patch_Qwen3VLMoeTextExperts_dtype() return get_model_tokenizer_qwen2_vl(model_dir, *args, **kwargs) @@ -857,8 +872,14 @@ def get_model_tokenizer_qwen3_moe_vl(model_dir, *args, **kwargs): ModelMeta( MLLMModelType.qwen3_moe_vl, [ ModelGroup([ + Model('Qwen/Qwen3-VL-30B-A3B-Instruct', 'Qwen/Qwen3-VL-30B-A3B-Instruct'), + Model('Qwen/Qwen3-VL-30B-A3B-Thinking', 'Qwen/Qwen3-VL-30B-A3B-Thinking'), + Model('Qwen/Qwen3-VL-30B-A3B-Instruct-FP8', 'Qwen/Qwen3-VL-30B-A3B-Instruct-FP8'), + Model('Qwen/Qwen3-VL-30B-A3B-Thinking-FP8', 'Qwen/Qwen3-VL-30B-A3B-Thinking-FP8'), Model('Qwen/Qwen3-VL-235B-A22B-Instruct', 'Qwen/Qwen3-VL-235B-A22B-Instruct'), Model('Qwen/Qwen3-VL-235B-A22B-Thinking', 'Qwen/Qwen3-VL-235B-A22B-Thinking'), + Model('Qwen/Qwen3-VL-235B-A22B-Instruct-FP8', 'Qwen/Qwen3-VL-235B-A22B-Instruct-FP8'), + Model('Qwen/Qwen3-VL-235B-A22B-Thinking-FP8', 'Qwen/Qwen3-VL-235B-A22B-Thinking-FP8'), ]), ], TemplateType.qwen3_vl, diff --git a/swift/trainers/__init__.py b/swift/trainers/__init__.py index 0e353ff527..720750554d 100644 --- a/swift/trainers/__init__.py +++ b/swift/trainers/__init__.py @@ -2,8 +2,7 @@ from typing import TYPE_CHECKING from transformers.trainer_callback import TrainerCallback -from transformers.trainer_utils import (EvaluationStrategy, FSDPOption, HPSearchBackend, HubStrategy, IntervalStrategy, - SchedulerType) +from transformers.trainer_utils import FSDPOption, HPSearchBackend, HubStrategy, IntervalStrategy, SchedulerType from swift.utils.import_utils import _LazyModule from . 
import callback diff --git a/swift/trainers/mixin.py b/swift/trainers/mixin.py index e329348e41..81e6f89d40 100644 --- a/swift/trainers/mixin.py +++ b/swift/trainers/mixin.py @@ -34,7 +34,7 @@ from transformers.integrations import is_deepspeed_zero3_enabled from transformers.modeling_utils import unwrap_model from transformers.trainer import (OPTIMIZER_NAME, PREFIX_CHECKPOINT_DIR, SCHEDULER_NAME, TRAINER_STATE_NAME, - ParallelMode, TrainerCallback, reissue_pt_warnings) + ParallelMode, Trainer, TrainerCallback, reissue_pt_warnings) from transformers.trainer_utils import IntervalStrategy from swift.hub import get_hub @@ -102,6 +102,9 @@ def _get_mean_metric(): self.model_meta = model.model_meta kwargs.update(self.create_loss_and_metric(args)) + trainer_parameters = inspect.signature(Trainer.__init__).parameters + tokenizer_key = 'processing_class' if 'processing_class' in trainer_parameters else 'tokenizer' + kwargs[tokenizer_key] = template.tokenizer with self.hub.patch_hub(): super().__init__( model=model, @@ -109,7 +112,6 @@ def _get_mean_metric(): data_collator=data_collator, train_dataset=train_dataset, eval_dataset=eval_dataset, - tokenizer=template.tokenizer, model_init=model_init, callbacks=callbacks, optimizers=optimizers, @@ -130,6 +132,11 @@ def _get_mean_metric(): # so reading train_state is skipped here. self.args.resume_from_checkpoint = None + @property + def tokenizer(self): + # compat transformers5.0 + return self.processing_class + @contextmanager def _patch_deepspeed_load_checkpoint(self): from transformers import trainer
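
A minimal standalone sketch of the Trainer keyword-compatibility check introduced in the mixin.py hunk above; the helper name `tokenizer_kwarg_name` and the usage lines are illustrative and not part of this change:

import inspect
from transformers import Trainer

def tokenizer_kwarg_name() -> str:
    # Newer transformers releases rename Trainer's `tokenizer` argument to
    # `processing_class`; inspecting the signature keeps the same call site
    # working on both sides of the rename.
    params = inspect.signature(Trainer.__init__).parameters
    return 'processing_class' if 'processing_class' in params else 'tokenizer'

# Usage sketch (assumes `model`, `args`, and `tokenizer` are already constructed):
# trainer = Trainer(model=model, args=args, **{tokenizer_kwarg_name(): tokenizer})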