README.md (2 changes: 1 addition & 1 deletion)
@@ -128,7 +128,7 @@ Running Environment:
 | python | >=3.9 | 3.10/3.11 | |
 | cuda | | cuda12 | No need to install if using CPU, NPU, MPS |
 | torch | >=2.0 | 2.7.1 | |
-| transformers | >=4.33 | 4.56.1 | |
+| transformers | >=4.33 | 4.56.2 | |
 | modelscope | >=1.23 | | |
 | peft | >=0.11,<0.18 | | |
 | flash_attn | | 2.7.4.post1/3.0.0b1 | |
README_CN.md (2 changes: 1 addition & 1 deletion)
@@ -124,7 +124,7 @@ pip install -e .
 | python | >=3.9 | 3.10/3.11 | |
 | cuda | | cuda12 | No need to install if using CPU, NPU, MPS |
 | torch | >=2.0 | 2.7.1 | |
-| transformers | >=4.33 | 4.56.1 | |
+| transformers | >=4.33 | 4.56.2 | |
 | modelscope | >=1.23 | | |
 | peft | >=0.11,<0.18 | | |
 | flash_attn | | 2.7.4.post1/3.0.0b1 | |
docs/source/GetStarted/SWIFT安装.md (2 changes: 1 addition & 1 deletion)
@@ -99,7 +99,7 @@ modelscope-registry.us-west-1.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu2
 | python | >=3.9 | 3.10/3.11 | |
 | cuda | | cuda12 | No need to install if using CPU, NPU, MPS |
 | torch | >=2.0 | 2.7.1 | |
-| transformers | >=4.33 | 4.56.1 | |
+| transformers | >=4.33 | 4.56.2 | |
 | modelscope | >=1.23 | | |
 | peft | >=0.11,<0.18 | | |
 | flash_attn | | 2.7.4.post1/3.0.0b1 | |
docs/source/Instruction/支持的模型和数据集.md (6 changes: 6 additions & 0 deletions)
@@ -706,8 +706,14 @@
 |[Qwen/Qwen3-Omni-30B-A3B-Captioner](https://modelscope.cn/models/Qwen/Qwen3-Omni-30B-A3B-Captioner)|qwen3_omni|qwen3_omni|transformers>=4.57.dev0, soundfile, decord, qwen_omni_utils|&#x2714;|vision, video, audio|[Qwen/Qwen3-Omni-30B-A3B-Captioner](https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Captioner)|
 |[Qwen/Qwen2-Audio-7B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-Audio-7B-Instruct)|qwen2_audio|qwen2_audio|transformers>=4.45,<4.49, librosa|&#x2718;|audio|[Qwen/Qwen2-Audio-7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct)|
 |[Qwen/Qwen2-Audio-7B](https://modelscope.cn/models/Qwen/Qwen2-Audio-7B)|qwen2_audio|qwen2_audio|transformers>=4.45,<4.49, librosa|&#x2718;|audio|[Qwen/Qwen2-Audio-7B](https://huggingface.co/Qwen/Qwen2-Audio-7B)|
 |[Qwen/Qwen3-VL-30B-A3B-Instruct](https://modelscope.cn/models/Qwen/Qwen3-VL-30B-A3B-Instruct)|qwen3_moe_vl|qwen3_vl|transformers>=4.57.0.dev, qwen_vl_utils>=0.0.14, decord|&#x2714;|vision, video|[Qwen/Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct)|
 |[Qwen/Qwen3-VL-30B-A3B-Thinking](https://modelscope.cn/models/Qwen/Qwen3-VL-30B-A3B-Thinking)|qwen3_moe_vl|qwen3_vl|transformers>=4.57.0.dev, qwen_vl_utils>=0.0.14, decord|&#x2714;|vision, video|[Qwen/Qwen3-VL-30B-A3B-Thinking](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Thinking)|
+|[Qwen/Qwen3-VL-30B-A3B-Instruct-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-30B-A3B-Instruct-FP8)|qwen3_moe_vl|qwen3_vl|transformers>=4.57.0.dev, qwen_vl_utils>=0.0.14, decord|&#x2718;|vision, video|[Qwen/Qwen3-VL-30B-A3B-Instruct-FP8](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct-FP8)|
+|[Qwen/Qwen3-VL-30B-A3B-Thinking-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-30B-A3B-Thinking-FP8)|qwen3_moe_vl|qwen3_vl|transformers>=4.57.0.dev, qwen_vl_utils>=0.0.14, decord|&#x2718;|vision, video|[Qwen/Qwen3-VL-30B-A3B-Thinking-FP8](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Thinking-FP8)|
+|[Qwen/Qwen3-VL-235B-A22B-Instruct](https://modelscope.cn/models/Qwen/Qwen3-VL-235B-A22B-Instruct)|qwen3_moe_vl|qwen3_vl|transformers>=4.57.0.dev, qwen_vl_utils>=0.0.14, decord|&#x2714;|vision, video|[Qwen/Qwen3-VL-235B-A22B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-235B-A22B-Instruct)|
+|[Qwen/Qwen3-VL-235B-A22B-Thinking](https://modelscope.cn/models/Qwen/Qwen3-VL-235B-A22B-Thinking)|qwen3_moe_vl|qwen3_vl|transformers>=4.57.0.dev, qwen_vl_utils>=0.0.14, decord|&#x2714;|vision, video|[Qwen/Qwen3-VL-235B-A22B-Thinking](https://huggingface.co/Qwen/Qwen3-VL-235B-A22B-Thinking)|
+|[Qwen/Qwen3-VL-235B-A22B-Instruct-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-235B-A22B-Instruct-FP8)|qwen3_moe_vl|qwen3_vl|transformers>=4.57.0.dev, qwen_vl_utils>=0.0.14, decord|&#x2718;|vision, video|[Qwen/Qwen3-VL-235B-A22B-Instruct-FP8](https://huggingface.co/Qwen/Qwen3-VL-235B-A22B-Instruct-FP8)|
+|[Qwen/Qwen3-VL-235B-A22B-Thinking-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-235B-A22B-Thinking-FP8)|qwen3_moe_vl|qwen3_vl|transformers>=4.57.0.dev, qwen_vl_utils>=0.0.14, decord|&#x2718;|vision, video|[Qwen/Qwen3-VL-235B-A22B-Thinking-FP8](https://huggingface.co/Qwen/Qwen3-VL-235B-A22B-Thinking-FP8)|
 |[Qwen/QVQ-72B-Preview](https://modelscope.cn/models/Qwen/QVQ-72B-Preview)|qvq|qvq|transformers>=4.45, qwen_vl_utils>=0.0.6, decord|&#x2718;|vision, video|[Qwen/QVQ-72B-Preview](https://huggingface.co/Qwen/QVQ-72B-Preview)|
 |[iic/gme-Qwen2-VL-2B-Instruct](https://modelscope.cn/models/iic/gme-Qwen2-VL-2B-Instruct)|qwen2_gme|qwen2_gme|-|&#x2718;|vision|[Alibaba-NLP/gme-Qwen2-VL-2B-Instruct](https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-2B-Instruct)|
 |[iic/gme-Qwen2-VL-7B-Instruct](https://modelscope.cn/models/iic/gme-Qwen2-VL-7B-Instruct)|qwen2_gme|qwen2_gme|-|&#x2718;|vision|[Alibaba-NLP/gme-Qwen2-VL-7B-Instruct](https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-7B-Instruct)|
docs/source/Megatron-SWIFT/快速开始.md (2 changes: 1 addition & 1 deletion)
@@ -67,7 +67,7 @@ modelscope-registry.us-west-1.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu2
 | apex | | 0.1 | |
 | megatron_core | >=0.12 | 0.13 | |
 | flash_attn | | 2.7.4.post1/3.0.0b1 | |
-| transformers | >=4.33 | 4.56.1 | |
+| transformers | >=4.33 | 4.56.2 | |
 | modelscope | >=1.23 | | |
 | peft | >=0.11,<0.18 | | LoRA |
 | trl | >=0.15,<0.21 | | RLHF |
docs/source_en/GetStarted/SWIFT-installation.md (2 changes: 1 addition & 1 deletion)
@@ -100,7 +100,7 @@ More images can be found [here](https://modelscope.cn/docs/intro/environment-set
 | python | >=3.9 | 3.10/3.11 | |
 | cuda | | cuda12 | No need to install if using CPU, NPU, MPS |
 | torch | >=2.0 | 2.7.1 | |
-| transformers | >=4.33 | 4.56.1 | |
+| transformers | >=4.33 | 4.56.2 | |
 | modelscope | >=1.23 | | |
 | peft | >=0.11,<0.18 | | |
 | flash_attn | | 2.7.4.post1/3.0.0b1 | |
docs/source_en/Instruction/Supported-models-and-datasets.md (6 changes: 6 additions & 0 deletions)
@@ -706,8 +706,14 @@ The table below introduces the models integrated with ms-swift:
 |[Qwen/Qwen3-Omni-30B-A3B-Captioner](https://modelscope.cn/models/Qwen/Qwen3-Omni-30B-A3B-Captioner)|qwen3_omni|qwen3_omni|transformers>=4.57.dev0, soundfile, decord, qwen_omni_utils|&#x2714;|vision, video, audio|[Qwen/Qwen3-Omni-30B-A3B-Captioner](https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Captioner)|
 |[Qwen/Qwen2-Audio-7B-Instruct](https://modelscope.cn/models/Qwen/Qwen2-Audio-7B-Instruct)|qwen2_audio|qwen2_audio|transformers>=4.45,<4.49, librosa|&#x2718;|audio|[Qwen/Qwen2-Audio-7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct)|
 |[Qwen/Qwen2-Audio-7B](https://modelscope.cn/models/Qwen/Qwen2-Audio-7B)|qwen2_audio|qwen2_audio|transformers>=4.45,<4.49, librosa|&#x2718;|audio|[Qwen/Qwen2-Audio-7B](https://huggingface.co/Qwen/Qwen2-Audio-7B)|
 |[Qwen/Qwen3-VL-30B-A3B-Instruct](https://modelscope.cn/models/Qwen/Qwen3-VL-30B-A3B-Instruct)|qwen3_moe_vl|qwen3_vl|transformers>=4.57.0.dev, qwen_vl_utils>=0.0.14, decord|&#x2714;|vision, video|[Qwen/Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct)|
 |[Qwen/Qwen3-VL-30B-A3B-Thinking](https://modelscope.cn/models/Qwen/Qwen3-VL-30B-A3B-Thinking)|qwen3_moe_vl|qwen3_vl|transformers>=4.57.0.dev, qwen_vl_utils>=0.0.14, decord|&#x2714;|vision, video|[Qwen/Qwen3-VL-30B-A3B-Thinking](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Thinking)|
+|[Qwen/Qwen3-VL-30B-A3B-Instruct-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-30B-A3B-Instruct-FP8)|qwen3_moe_vl|qwen3_vl|transformers>=4.57.0.dev, qwen_vl_utils>=0.0.14, decord|&#x2718;|vision, video|[Qwen/Qwen3-VL-30B-A3B-Instruct-FP8](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct-FP8)|
+|[Qwen/Qwen3-VL-30B-A3B-Thinking-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-30B-A3B-Thinking-FP8)|qwen3_moe_vl|qwen3_vl|transformers>=4.57.0.dev, qwen_vl_utils>=0.0.14, decord|&#x2718;|vision, video|[Qwen/Qwen3-VL-30B-A3B-Thinking-FP8](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Thinking-FP8)|
+|[Qwen/Qwen3-VL-235B-A22B-Instruct](https://modelscope.cn/models/Qwen/Qwen3-VL-235B-A22B-Instruct)|qwen3_moe_vl|qwen3_vl|transformers>=4.57.0.dev, qwen_vl_utils>=0.0.14, decord|&#x2714;|vision, video|[Qwen/Qwen3-VL-235B-A22B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-235B-A22B-Instruct)|
+|[Qwen/Qwen3-VL-235B-A22B-Thinking](https://modelscope.cn/models/Qwen/Qwen3-VL-235B-A22B-Thinking)|qwen3_moe_vl|qwen3_vl|transformers>=4.57.0.dev, qwen_vl_utils>=0.0.14, decord|&#x2714;|vision, video|[Qwen/Qwen3-VL-235B-A22B-Thinking](https://huggingface.co/Qwen/Qwen3-VL-235B-A22B-Thinking)|
+|[Qwen/Qwen3-VL-235B-A22B-Instruct-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-235B-A22B-Instruct-FP8)|qwen3_moe_vl|qwen3_vl|transformers>=4.57.0.dev, qwen_vl_utils>=0.0.14, decord|&#x2718;|vision, video|[Qwen/Qwen3-VL-235B-A22B-Instruct-FP8](https://huggingface.co/Qwen/Qwen3-VL-235B-A22B-Instruct-FP8)|
+|[Qwen/Qwen3-VL-235B-A22B-Thinking-FP8](https://modelscope.cn/models/Qwen/Qwen3-VL-235B-A22B-Thinking-FP8)|qwen3_moe_vl|qwen3_vl|transformers>=4.57.0.dev, qwen_vl_utils>=0.0.14, decord|&#x2718;|vision, video|[Qwen/Qwen3-VL-235B-A22B-Thinking-FP8](https://huggingface.co/Qwen/Qwen3-VL-235B-A22B-Thinking-FP8)|
 |[Qwen/QVQ-72B-Preview](https://modelscope.cn/models/Qwen/QVQ-72B-Preview)|qvq|qvq|transformers>=4.45, qwen_vl_utils>=0.0.6, decord|&#x2718;|vision, video|[Qwen/QVQ-72B-Preview](https://huggingface.co/Qwen/QVQ-72B-Preview)|
 |[iic/gme-Qwen2-VL-2B-Instruct](https://modelscope.cn/models/iic/gme-Qwen2-VL-2B-Instruct)|qwen2_gme|qwen2_gme|-|&#x2718;|vision|[Alibaba-NLP/gme-Qwen2-VL-2B-Instruct](https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-2B-Instruct)|
 |[iic/gme-Qwen2-VL-7B-Instruct](https://modelscope.cn/models/iic/gme-Qwen2-VL-7B-Instruct)|qwen2_gme|qwen2_gme|-|&#x2718;|vision|[Alibaba-NLP/gme-Qwen2-VL-7B-Instruct](https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-7B-Instruct)|
docs/source_en/Megatron-SWIFT/Quick-start.md (2 changes: 1 addition & 1 deletion)
@@ -67,7 +67,7 @@ Recommended Operating Environment:
 | apex | | 0.1 | |
 | megatron_core | >=0.12 | 0.13 | |
 | flash_attn | | 2.7.4.post1/3.0.0b1 | |
-| transformers | >=4.33 | 4.56.1 | |
+| transformers | >=4.33 | 4.56.2 | |
 | modelscope | >=1.23 | | |
 | peft | >=0.11,<0.18 | | LoRA |
 | trl | >=0.15,<0.21 | | RLHF |
examples/models/qwen3_vl/zero2.sh (36 changes: 36 additions & 0 deletions)
@@ -0,0 +1,36 @@
+# zero2: 70GiB
+IMAGE_MAX_TOKEN_NUM=1024 \
+NPROC_PER_NODE=2 \
+CUDA_VISIBLE_DEVICES=0,1 \
+swift sft \
+    --model Qwen/Qwen3-VL-30B-A3B-Instruct \
+    --dataset 'AI-ModelScope/LaTeX_OCR:human_handwrite#20000' \
+    --load_from_cache_file true \
+    --split_dataset_ratio 0.01 \
+    --train_type lora \
+    --torch_dtype bfloat16 \
+    --num_train_epochs 1 \
+    --per_device_train_batch_size 16 \
+    --per_device_eval_batch_size 16 \
+    --attn_impl flash_attn \
+    --padding_free true \
+    --learning_rate 1e-4 \
+    --lora_rank 8 \
+    --lora_alpha 32 \
+    --target_modules all-linear \
+    --router_aux_loss_coef 1e-3 \
+    --freeze_vit true \
+    --freeze_aligner true \
+    --gradient_accumulation_steps 1 \
+    --gradient_checkpointing true \
+    --eval_steps 100 \
+    --save_steps 100 \
+    --save_total_limit 2 \
+    --logging_steps 5 \
+    --max_length 2048 \
+    --output_dir output \
+    --warmup_ratio 0.05 \
+    --deepspeed zero2 \
+    --use_liger_kernel true \
+    --dataset_num_proc 4 \
+    --dataloader_num_workers 4
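For reference, the same run can be sketched through the Python entry points, assuming the programmatic API mirrors the CLI flags one-for-one (swift.llm.sft_main and TrainArguments are assumed to be the entry points in current ms-swift; verify the field names against your installed version). Note this launches a single process, unlike the 2-GPU NPROC_PER_NODE launcher above:

import os

# Same env knob the shell script sets; the value is an assumption carried over.
os.environ['IMAGE_MAX_TOKEN_NUM'] = '1024'

from swift.llm import TrainArguments, sft_main  # assumed entry points

# Minimal sketch with a subset of the flags; remaining fields use defaults.
sft_main(TrainArguments(
    model='Qwen/Qwen3-VL-30B-A3B-Instruct',
    dataset=['AI-ModelScope/LaTeX_OCR:human_handwrite#20000'],
    train_type='lora',
    torch_dtype='bfloat16',
    per_device_train_batch_size=16,
    deepspeed='zero2',
    max_length=2048,
    output_dir='output',
))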
swift/__init__.py (6 changes: 2 additions & 4 deletions)
@@ -12,9 +12,8 @@
                          PromptEncoderConfig, PromptLearningConfig, PromptTuningConfig, get_peft_config, get_peft_model,
                          get_peft_model_state_dict, Prompt, PromptConfig, PromptModule, SwiftConfig, SwiftOutput, Swift,
                          SwiftTuners, LongLoRAConfig, LongLoRA, LongLoRAModelType, SCETuning, SCETuningConfig)
-    from .trainers import (EvaluationStrategy, FSDPOption, HPSearchBackend, HubStrategy, IntervalStrategy,
-                           SchedulerType, ShardedDDPOption, TrainingArguments, Seq2SeqTrainingArguments, Trainer,
-                           Seq2SeqTrainer)
+    from .trainers import (FSDPOption, HPSearchBackend, HubStrategy, IntervalStrategy, SchedulerType, ShardedDDPOption,
+                           TrainingArguments, Seq2SeqTrainingArguments, Trainer, Seq2SeqTrainer)
     from .utils import get_logger
 else:
     _import_structure = {
@@ -29,7 +28,6 @@
             'Swift', 'SwiftTuners', 'LongLoRAConfig', 'LongLoRA', 'LongLoRAModelType', 'SCETuning', 'SCETuningConfig'
         ],
         'trainers': [
-            'EvaluationStrategy',
            'FSDPOption',
            'HPSearchBackend',
            'HubStrategy',
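The two branches must stay in sync: the TYPE_CHECKING block gives type checkers eager imports, while _import_structure feeds a lazy module so `import swift` stays cheap, which is why 'EvaluationStrategy' is removed from both places. A simplified sketch of the lazy-import pattern (the real implementation is swift.utils.import_utils._LazyModule, not this class):

import importlib
import types

class LazyModuleSketch(types.ModuleType):
    # Map each exported name to its submodule, then import on first access.
    def __init__(self, name, import_structure):
        super().__init__(name)
        self._name_to_module = {
            attr: mod for mod, attrs in import_structure.items() for attr in attrs
        }

    def __getattr__(self, item):
        submodule = self._name_to_module[item]
        module = importlib.import_module(f'{self.__name__}.{submodule}')
        return getattr(module, item)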
swift/llm/model/model/qwen.py (21 changes: 21 additions & 0 deletions)
@@ -826,6 +826,20 @@ def get_model_tokenizer_qwen2_5_vl(*args, **kwargs):
         tags=['vision', 'video']))


+def patch_Qwen3VLMoeTextExperts_dtype():
+    from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import Qwen3VLMoeTextExperts
+    if hasattr(Qwen3VLMoeTextExperts, '_patch'):
+        return
+    Qwen3VLMoeTextExperts._patch = True
+    origin_forward = Qwen3VLMoeTextExperts.forward
+
+    def forward(self, hidden_states, *args, **kwargs):
+        res = origin_forward(self, hidden_states, *args, **kwargs)
+        return res.to(hidden_states.dtype)
+
+    Qwen3VLMoeTextExperts.forward = forward
+
+
 def get_model_tokenizer_qwen3_vl(model_dir, *args, **kwargs):
     from transformers import Qwen3VLForConditionalGeneration
     require_version('qwen_vl_utils>=0.0.14')
@@ -850,15 +864,22 @@ def get_model_tokenizer_qwen3_moe_vl(model_dir, *args, **kwargs):
     require_version('qwen_vl_utils>=0.0.14')
     kwargs['automodel_class'] = kwargs['automodel_class'] or Qwen3VLMoeForConditionalGeneration
     kwargs['_check_qwen_vl_utils'] = False
+    patch_Qwen3VLMoeTextExperts_dtype()
     return get_model_tokenizer_qwen2_vl(model_dir, *args, **kwargs)


 register_model(
     ModelMeta(
         MLLMModelType.qwen3_moe_vl, [
             ModelGroup([
                 Model('Qwen/Qwen3-VL-30B-A3B-Instruct', 'Qwen/Qwen3-VL-30B-A3B-Instruct'),
                 Model('Qwen/Qwen3-VL-30B-A3B-Thinking', 'Qwen/Qwen3-VL-30B-A3B-Thinking'),
+                Model('Qwen/Qwen3-VL-30B-A3B-Instruct-FP8', 'Qwen/Qwen3-VL-30B-A3B-Instruct-FP8'),
+                Model('Qwen/Qwen3-VL-30B-A3B-Thinking-FP8', 'Qwen/Qwen3-VL-30B-A3B-Thinking-FP8'),
+                Model('Qwen/Qwen3-VL-235B-A22B-Instruct', 'Qwen/Qwen3-VL-235B-A22B-Instruct'),
+                Model('Qwen/Qwen3-VL-235B-A22B-Thinking', 'Qwen/Qwen3-VL-235B-A22B-Thinking'),
+                Model('Qwen/Qwen3-VL-235B-A22B-Instruct-FP8', 'Qwen/Qwen3-VL-235B-A22B-Instruct-FP8'),
+                Model('Qwen/Qwen3-VL-235B-A22B-Thinking-FP8', 'Qwen/Qwen3-VL-235B-A22B-Thinking-FP8'),
             ]),
         ],
         TemplateType.qwen3_vl,
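The patch wraps the experts' forward so its output is cast back to the input dtype, guarding against an internal upcast (e.g. to float32) leaking into downstream bfloat16 layers; the `_patch` sentinel keeps it idempotent. A self-contained sketch of the same pattern, where ToyExperts is a stand-in for the real Qwen3VLMoeTextExperts class:

import torch
from torch import nn

class ToyExperts(nn.Module):
    def forward(self, hidden_states):
        # Simulates an expert layer that silently upcasts to float32.
        return hidden_states.float()

def patch_output_dtype(cls):
    # Idempotent class-level monkey patch: cast output back to input dtype.
    if hasattr(cls, '_patch'):
        return
    cls._patch = True
    origin_forward = cls.forward

    def forward(self, hidden_states, *args, **kwargs):
        res = origin_forward(self, hidden_states, *args, **kwargs)
        return res.to(hidden_states.dtype)

    cls.forward = forward

patch_output_dtype(ToyExperts)
x = torch.randn(2, 4, dtype=torch.bfloat16)
assert ToyExperts()(x).dtype == torch.bfloat16  # upcast no longer leaks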
swift/trainers/__init__.py (3 changes: 1 addition & 2 deletions)
@@ -2,8 +2,7 @@
 from typing import TYPE_CHECKING

 from transformers.trainer_callback import TrainerCallback
-from transformers.trainer_utils import (EvaluationStrategy, FSDPOption, HPSearchBackend, HubStrategy, IntervalStrategy,
-                                        SchedulerType)
+from transformers.trainer_utils import FSDPOption, HPSearchBackend, HubStrategy, IntervalStrategy, SchedulerType

 from swift.utils.import_utils import _LazyModule
 from . import callback
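EvaluationStrategy was a long-deprecated alias that newer transformers releases drop, which is why the re-export is removed here; IntervalStrategy covers the same values, so callers can migrate one-for-one. A quick sketch:

from transformers.trainer_utils import IntervalStrategy

# Value-based lookup returns the enum member, as with the old alias.
assert IntervalStrategy('steps') is IntervalStrategy.STEPS
print([s.value for s in IntervalStrategy])  # ['no', 'steps', 'epoch']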
swift/trainers/mixin.py (11 changes: 9 additions & 2 deletions)
@@ -34,7 +34,7 @@
 from transformers.integrations import is_deepspeed_zero3_enabled
 from transformers.modeling_utils import unwrap_model
 from transformers.trainer import (OPTIMIZER_NAME, PREFIX_CHECKPOINT_DIR, SCHEDULER_NAME, TRAINER_STATE_NAME,
-                                  ParallelMode, TrainerCallback, reissue_pt_warnings)
+                                  ParallelMode, Trainer, TrainerCallback, reissue_pt_warnings)
 from transformers.trainer_utils import IntervalStrategy

 from swift.hub import get_hub
@@ -102,14 +102,16 @@ def _get_mean_metric():
         self.model_meta = model.model_meta

         kwargs.update(self.create_loss_and_metric(args))
+        trainer_parameters = inspect.signature(Trainer.__init__).parameters
+        tokenizer_key = 'processing_class' if 'processing_class' in trainer_parameters else 'tokenizer'
+        kwargs[tokenizer_key] = template.tokenizer
         with self.hub.patch_hub():
             super().__init__(
                 model=model,
                 args=args,
                 data_collator=data_collator,
                 train_dataset=train_dataset,
                 eval_dataset=eval_dataset,
-                tokenizer=template.tokenizer,
                 model_init=model_init,
                 callbacks=callbacks,
                 optimizers=optimizers,
@@ -130,6 +132,11 @@ def _get_mean_metric():
         # so reading train_state is skipped here.
         self.args.resume_from_checkpoint = None

+    @property
+    def tokenizer(self):
+        # compat transformers5.0
+        return self.processing_class
+
     @contextmanager
     def _patch_deepspeed_load_checkpoint(self):
         from transformers import trainer
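This shim picks whichever keyword the installed Trainer actually accepts, since transformers renamed `tokenizer` to `processing_class` and removes the old attribute in 5.0 (the property above restores it for callers). The selection logic in isolation, as a runnable sketch:

import inspect
from transformers import Trainer

def tokenizer_kwarg() -> str:
    # 'processing_class' on newer transformers, 'tokenizer' on older ones.
    params = inspect.signature(Trainer.__init__).parameters
    return 'processing_class' if 'processing_class' in params else 'tokenizer'

print(tokenizer_kwarg())  # e.g. {tokenizer_kwarg(): my_tokenizer} then Trainer(**kwargs)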